2021-06-14T15:43:39+08:00


        *** GPGPU-Sim Simulator Version 3.2.2  [build 0] ***


GPGPU-Sim PTX: simulation mode 0 (can change with PTX_SIM_MODE_FUNC environment variable:
               1=functional simulation only, 0=detailed performance simulator)
GPGPU-Sim: Configuration options:

-network_mode                           1 # Interconnection network mode
-inter_config_file   config_fermi_islip.icnt # Interconnection network config file
-gpgpu_ptx_use_cuobjdump                    1 # Use cuobjdump to extract ptx and sass from binaries
-gpgpu_experimental_lib_support                    0 # Try to extract code from cuda libraries [Broken because of unknown cudaGetExportTable]
-gpgpu_ptx_convert_to_ptxplus                    0 # Convert SASS (native ISA) to ptxplus and run ptxplus
-gpgpu_ptx_force_max_capability                   20 # Force maximum compute capability
-gpgpu_ptx_inst_debug_to_file                    0 # Dump executed instructions' debug information to file
-gpgpu_ptx_inst_debug_file       inst_debug.txt # Executed instructions' debug output file
-gpgpu_ptx_inst_debug_thread_uid                    1 # Thread UID for executed instructions' debug output
-gpgpu_simd_model                       1 # 1 = post-dominator
-gpgpu_shader_core_pipeline              1536:32 # shader core pipeline config, i.e., {<nthread>:<warpsize>}
-gpgpu_tex_cache:l1  4:128:24,L:R:m:N:L,F:128:4,128:2 # per-shader L1 texture cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>:<rf>}
-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4 # per-shader L1 constant memory cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:il1     4:128:4,L:R:f:N:L,A:2:32,4 # shader L1 instruction cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:dl1     32:128:4,L:L:m:N:H,A:32:8,8 # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PrefL1                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PreShared                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gmem_skip_L1D                          0 # global memory access skip L1D cache (implements -Xptxas -dlcm=cg, default=no skip)
-gpgpu_perfect_mem                      0 # enable perfect memory mode (no cache miss)
-n_regfile_gating_group                    4 # group of lanes that should be read/written together)
-gpgpu_clock_gated_reg_file                    0 # enable clock gated reg file for power calculations
-gpgpu_clock_gated_lanes                    0 # enable clock gated lanes for power calculations
-gpgpu_shader_registers                32768 # Number of registers per shader core. Limits number of concurrent CTAs. (default 8192)
-gpgpu_shader_cta                       8 # Maximum number of concurrent CTAs in shader (default 8)
-gpgpu_num_cta_barriers                   16 # Maximum number of named barriers per CTA (default 16)
-gpgpu_n_clusters                      15 # number of processing clusters
-gpgpu_n_cores_per_cluster                    4 # number of simd cores per cluster
-gpgpu_n_cluster_ejection_buffer_size                    8 # number of packets in ejection buffer
-gpgpu_n_ldst_response_buffer_size                    2 # number of response packets in ld/st unit ejection buffer
-gpgpu_shmem_size                   16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size                   49152 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefL1                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefShared                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_num_banks                   32 # Number of banks in the shared memory in each shader core (default 16)
-gpgpu_shmem_limited_broadcast                    0 # Limit shared memory to do one broadcast per cycle (default on)
-gpgpu_shmem_warp_parts                    1 # Number of portions a warp is divided into for shared memory bank conflict check 
-gpgpu_warpdistro_shader                   -1 # Specify which shader core to collect the warp size distribution from
-gpgpu_warp_issue_shader                    0 # Specify which shader core to collect the warp issue distribution from
-gpgpu_local_mem_map                    1 # Mapping from local memory space address to simulated GPU physical address space (default = enabled)
-gpgpu_num_reg_banks                   16 # Number of register banks (default = 8)
-gpgpu_reg_bank_use_warp_id                    0 # Use warp ID in mapping registers to banks (default = off)
-gpgpu_operand_collector_num_units_sp                    6 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_sfu                    8 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_mem                    2 # number of collector units (default = 2)
-gpgpu_operand_collector_num_units_gen                    0 # number of collector units (default = 0)
-gpgpu_operand_collector_num_in_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_operand_collector_num_out_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_coalesce_arch                   13 # Coalescing arch (default = 13, anything else is off for now)
-gpgpu_num_sched_per_core                    2 # Number of warp schedulers per core
-gpgpu_max_insn_issue_per_warp                    1 # Max number of instructions that can be issued per warp in one cycle by scheduler
-gpgpu_simt_core_sim_order                    1 # Select the simulation order of cores in a cluster (0=Fix, 1=Round-Robin)
-gpgpu_pipeline_widths        2,1,1,2,1,1,2 # Pipeline widths ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
-gpgpu_num_sp_units                     2 # Number of SP units (default=1)
-gpgpu_num_sfu_units                    1 # Number of SF units (default=1)
-gpgpu_num_mem_units                    1 # Number if ldst units (default=1) WARNING: not hooked up to anything
-gpgpu_scheduler                      gto # Scheduler configuration: < lrr | gto | two_level_active > If two_level_active:<num_active_warps>:<inner_prioritization>:<outer_prioritization>For complete list of prioritization values see shader.h enum scheduler_prioritization_typeDefault: gto
-gpgpu_dram_scheduler                    1 # 0 = fifo, 1 = FR-FCFS (defaul)
-gpgpu_dram_partition_queues              8:8:8:8 # i2$:$2d:d2$:$2i
-l2_ideal                               0 # Use a ideal L2 cache that always hit
-gpgpu_cache:dl2     64:128:8,L:B:m:W:L,A:32:4,4:0,32 # unified banked L2 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>}
-gpgpu_cache:dl2_texture_only                    0 # L2 cache used for texture only
-gpgpu_n_mem                            6 # number of memory modules (e.g. memory controllers) in gpu
-gpgpu_n_sub_partition_per_mchannel                    2 # number of memory subpartition in each memory module
-gpgpu_n_mem_per_ctrlr                    2 # number of memory chips per memory controller
-gpgpu_memlatency_stat                   14 # track and display latency statistics 0x2 enables MC, 0x4 enables queue logs
-gpgpu_frfcfs_dram_sched_queue_size                   16 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_return_queue_size                  116 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_buswidth                    4 # default = 4 bytes (8 bytes per cycle at DDR)
-gpgpu_dram_burst_length                    8 # Burst length of each DRAM request (default = 4 data bus cycle)
-dram_data_command_freq_ratio                    4 # Frequency ratio between DRAM data bus and command bus (default = 2 times, i.e. DDR)
-gpgpu_dram_timing_opt nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2 # DRAM timing parameters = {nbk:tCCD:tRRD:tRCD:tRAS:tRP:tRC:CL:WL:tCDLR:tWR:nbkgrp:tCCDL:tRTPL}
-rop_latency                          120 # ROP queue latency (default 85)
-dram_latency                         100 # DRAM latency (default 30)
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS # mapping memory address to dram model {dramid@<start bit>;<memory address map>}
-gpgpu_mem_addr_test                    0 # run sweep test to check address mapping for aliased address
-gpgpu_mem_address_mask                    1 # 0 = old addressing mask, 1 = new addressing mask, 2 = new add. mask + flipped bank sel and chip sel bits
-gpuwattch_xml_file  gpuwattch_gtx480.xml # GPUWattch XML file
-power_simulation_enabled                    1 # Turn on power simulator (1=On, 0=Off)
-power_per_cycle_dump                    0 # Dump detailed power output each cycle
-power_trace_enabled                    0 # produce a file for the power trace (1=On, 0=Off)
-power_trace_zlevel                     6 # Compression level of the power trace output log (0=no comp, 9=highest)
-steady_power_levels_enabled                    0 # produce a file for the steady power levels (1=On, 0=Off)
-steady_state_definition                  8:4 # allowed deviation:number of samples
-gpgpu_max_cycle                        0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_insn                         0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_cta                          0 # terminates gpu simulation early (0 = no limit)
-gpgpu_runtime_stat                   500 # display runtime statistics such as dram utilization {<freq>:<flag>}
-liveness_message_freq                    1 # Minimum number of seconds between simulation liveness messages (0 = always print)
-gpgpu_flush_l1_cache                    0 # Flush L1 cache at the end of each kernel call
-gpgpu_flush_l2_cache                    0 # Flush L2 cache at the end of each kernel call
-gpgpu_deadlock_detect                    1 # Stop the simulation at deadlock (1=on (default), 0=off)
-gpgpu_ptx_instruction_classification                    0 # if enabled will classify ptx instruction types per kernel (Max 255 kernels now)
-gpgpu_ptx_sim_mode                     0 # Select between Performance (default) or Functional simulation (1)
-gpgpu_clock_domains 700.0:700.0:700.0:924.0 # Clock Domain Frequencies in MhZ {<Core Clock>:<ICNT Clock>:<L2 Clock>:<DRAM Clock>}
-gpgpu_max_concurrent_kernel                    8 # maximum kernels that can run concurrently on GPU
-gpgpu_cflog_interval                    0 # Interval between each snapshot in control flow logger
-visualizer_enabled                     0 # Turn on visualizer output (1=On, 0=Off)
-visualizer_outputfile                 NULL # Specifies the output log file for visualizer
-visualizer_zlevel                      6 # Compression level of the visualizer output log (0=no comp, 9=highest)
-trace_enabled                          0 # Turn on traces
-trace_components                    none # comma seperated list of traces to enable. Complete list found in trace_streams.tup. Default none
-trace_sampling_core                    0 # The core which is printed using CORE_DPRINTF. Default 0
-trace_sampling_memory_partition                   -1 # The memory partition which is printed using MEMPART_DPRINTF. Default -1 (i.e. all)
-enable_ptx_file_line_stats                    1 # Turn on PTX source line statistic profiling. (1 = On)
-ptx_line_stats_filename gpgpu_inst_stats.txt # Output file for PTX source line statistics.
-save_embedded_ptx                      0 # saves ptx files embedded in binary as <n>.ptx
-keep                                   0 # keep intermediate files created by GPGPU-Sim when interfacing with external programs
-gpgpu_ptx_save_converted_ptxplus                    0 # Saved converted ptxplus to a file
-ptx_opcode_latency_int         4,13,4,5,145 # Opcode latencies for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,19,25,145
-ptx_opcode_latency_fp          4,13,4,5,39 # Opcode latencies for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,30
-ptx_opcode_latency_dp         8,19,8,8,330 # Opcode latencies for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,335
-ptx_opcode_initiation_int            1,2,2,1,8 # Opcode initiation intervals for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,4,4,32
-ptx_opcode_initiation_fp            1,2,1,1,4 # Opcode initiation intervals for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,5
-ptx_opcode_initiation_dp         8,16,8,8,130 # Opcode initiation intervals for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,130
DRAM Timing Options:
nbk                                    16 # number of banks
CCD                                     2 # column to column delay
RRD                                     6 # minimal delay between activation of rows in different banks
RCD                                    12 # row to column delay
RAS                                    28 # time needed to activate row
RP                                     12 # time needed to precharge (deactivate) row
RC                                     40 # row cycle time
CDLR                                    5 # switching from write to read (changes tWTR)
WR                                     12 # last data-in to row precharge
CL                                     12 # CAS latency
WL                                      4 # Write latency
nbkgrp                                  4 # number of bank groups
CCDL                                    3 # column to column delay between accesses to different bank groups
RTPL                                    2 # read to precharge delay between accesses to different bank groups
Total number of memory sub partition = 12
addr_dec_mask[CHIP]  = 0000000000000000 	high:64 low:0
addr_dec_mask[BK]    = 000000000000e100 	high:16 low:8
addr_dec_mask[ROW]   = 000000000fff0000 	high:28 low:16
addr_dec_mask[COL]   = 0000000000001eff 	high:13 low:0
addr_dec_mask[BURST] = 000000000000003f 	high:6 low:0
sub_partition_id_mask = 0000000000000100
GPGPU-Sim uArch: clock freqs: 700000000.000000:700000000.000000:700000000.000000:924000000.000000
GPGPU-Sim uArch: clock periods: 0.00000000142857142857:0.00000000142857142857:0.00000000142857142857:0.00000000108225108225
*** Initializing Memory Statistics ***
GPGPU-Sim uArch: interconnect node map (shaderID+MemID to icntID)
GPGPU-Sim uArch: Memory nodes ID start from index: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
GPGPU-Sim uArch: interconnect node reverse map (icntID to shaderID+MemID)
GPGPU-Sim uArch: Memory nodes start from ID: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
83a4e518f69376f7e08643a3a9e17862  /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
GPGPU-Sim uArch: performance model initialization complete.
GPGPU-Sim PTX: __cudaRegisterFatBinary, fat_cubin_handle = 1, filename=mm.cu
self exe links to: /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
Running md5sum using "md5sum /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM "
Running cuobjdump using "$CUDA_INSTALL_PATH/bin/cuobjdump -ptx -elf -sass /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM > _cuobjdump_complete_output_E3YUGV"
Parsing file _cuobjdump_complete_output_E3YUGV
######### cuobjdump parser ########
## Adding new section ELF
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_1.ptx
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section ELF
Adding arch: sm_20
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_2.ptx
Adding arch: sm_20
Adding identifier: mm.cu
Done parsing!!!
GPGPU-Sim PTX: __cudaRegisterFunction _Z14matrix_mul_gpuPiS_S_i : hostFun 0x0x400ce0, fat_cubin_handle = 1
GPGPU-Sim PTX: instruction assembly for function '_Z14matrix_mul_gpuPiS_S_i'...   done.
GPGPU-Sim PTX: finding reconvergence points for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: reconvergence points for _Z14matrix_mul_gpuPiS_S_i...
GPGPU-Sim PTX:  1 (potential) branch divergence @  PC=0x048 (_1.ptx:71) @%p1 bra $Lt_0_2306;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX:  2 (potential) branch divergence @  PC=0x130 (_1.ptx:103) @%p2 bra $Lt_0_1794;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:  3 (potential) branch divergence @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX: ... end of reconvergence points for _Z14matrix_mul_gpuPiS_S_i
GPGPU-Sim PTX: ... done pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'.
GPGPU-Sim PTX: finished parsing EMBEDDED .ptx file _1.ptx
Adding _cuobjdump_2.ptx with cubin handle 1
GPGPU-Sim PTX: extracting embedded .ptx to temporary file "_ptx_d9VRdA"
Running: cat _ptx_d9VRdA | sed 's/.version 1.5/.version 1.4/' | sed 's/, texmode_independent//' | sed 's/\(\.extern \.const\[1\] .b8 \w\+\)\[\]/\1\[1\]/' | sed 's/const\[.\]/const\[0\]/g' > _ptx2_GPyPKe
GPGPU-Sim PTX: generating ptxinfo using "$CUDA_INSTALL_PATH/bin/ptxas --gpu-name=sm_20 -v _ptx2_GPyPKe --output-file  /dev/null 2> _ptx_d9VRdAinfo"
GPGPU-Sim PTX: Kernel '_Z14matrix_mul_gpuPiS_S_i' : regs=14, lmem=0, smem=0, cmem=60
GPGPU-Sim PTX: removing ptxinfo using "rm -f _ptx_d9VRdA _ptx2_GPyPKe _ptx_d9VRdAinfo"
GPGPU-Sim PTX: loading globals with explicit initializers... 
GPGPU-Sim PTX: finished loading globals (0 bytes total).
GPGPU-Sim PTX: loading constants with explicit initializers...  done.
Block(10,10)   Grid(15,8).

GPGPU-Sim PTX: cudaLaunch for 0x0x400ce0 (mode=performance simulation) on stream 0
GPGPU-Sim PTX: pushing kernel '_Z14matrix_mul_gpuPiS_S_i' to stream 0, gridDim= (15,8,1) blockDim = (10,10,1) 
kernel '_Z14matrix_mul_gpuPiS_S_i' transfer to GPU hardware scheduler
GPGPU-Sim uArch: Shader 4 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: CTA/core = 8, limited by: cta_limit
GPGPU-Sim uArch: core:  4, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 8 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  8, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 12 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 12, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 16 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 16, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 20 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 20, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 24 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 24, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 28 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 28, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 32 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 32, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 36 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 36, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 40 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 40, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 44 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 44, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 48 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 48, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 52 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 52, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 56 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 56, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 0 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  0, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 5 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  5, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 9 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  9, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 13 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 13, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 17 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 17, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 21 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 21, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 25 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 25, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 29 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 29, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 33 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 33, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 37 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 37, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 41 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 41, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 45 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 45, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 49 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 49, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 53 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 53, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 57 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 57, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 1 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  1, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 6 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  6, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 10 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 10, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 14 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 14, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 18 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 18, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 22 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 22, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 26 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 26, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 30 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 30, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 34 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 34, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 38 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 38, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 42 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 42, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 46 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 46, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 50 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 50, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 54 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 54, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 58 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 58, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 2 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  2, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 7 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  7, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 11 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 11, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 15 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 15, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 19 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 19, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 23 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 23, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 27 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 27, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 31 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 31, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 35 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 35, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 39 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 39, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 43 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 43, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 47 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 47, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 51 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 51, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 55 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 55, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 59 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 59, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 3 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  3, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: core:  4, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  8, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 12, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 16, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 20, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 24, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 28, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 32, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 36, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 40, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 44, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 48, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 52, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 56, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  0, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  5, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  9, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 13, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 17, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 21, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 25, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 29, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 33, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 37, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 41, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 45, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 49, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 53, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 57, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  1, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  6, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 10, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 14, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 18, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 22, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 26, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 30, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 34, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 38, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 42, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 46, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 50, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 54, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 58, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  2, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  7, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 11, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 15, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 19, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 23, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 27, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 31, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 35, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 39, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 43, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 47, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 51, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 55, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 59, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core:  3, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: cycles simulated: 500  inst.: 49456 (ipc=98.9) sim_rate=49456 (inst/sec) elapsed = 0:0:00:01 / Mon Jun 14 15:43:42 2021
GPGPU-Sim PTX: 100000 instructions simulated : ctaid=(1,0,0) tid=(1,5,0)
GPGPU-Sim PTX: 200000 instructions simulated : ctaid=(0,4,0) tid=(1,1,0)
GPGPU-Sim PTX: 300000 instructions simulated : ctaid=(8,2,0) tid=(5,3,0)
GPGPU-Sim uArch: cycles simulated: 1500  inst.: 305100 (ipc=203.4) sim_rate=152550 (inst/sec) elapsed = 0:0:00:02 / Mon Jun 14 15:43:43 2021
GPGPU-Sim PTX: 400000 instructions simulated : ctaid=(2,2,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 2500  inst.: 473544 (ipc=189.4) sim_rate=157848 (inst/sec) elapsed = 0:0:00:03 / Mon Jun 14 15:43:44 2021
GPGPU-Sim PTX: 500000 instructions simulated : ctaid=(12,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 600000 instructions simulated : ctaid=(3,6,0) tid=(7,4,0)
GPGPU-Sim PTX: 700000 instructions simulated : ctaid=(8,1,0) tid=(3,8,0)
GPGPU-Sim PTX: 800000 instructions simulated : ctaid=(3,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 900000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 3500  inst.: 874228 (ipc=249.8) sim_rate=218557 (inst/sec) elapsed = 0:0:00:04 / Mon Jun 14 15:43:45 2021
GPGPU-Sim PTX: 1000000 instructions simulated : ctaid=(8,6,0) tid=(7,8,0)
GPGPU-Sim PTX: 1100000 instructions simulated : ctaid=(1,3,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 4000  inst.: 1092420 (ipc=273.1) sim_rate=218484 (inst/sec) elapsed = 0:0:00:05 / Mon Jun 14 15:43:46 2021
GPGPU-Sim PTX: 1200000 instructions simulated : ctaid=(5,0,0) tid=(9,1,0)
GPGPU-Sim PTX: 1300000 instructions simulated : ctaid=(11,0,0) tid=(9,9,0)
GPGPU-Sim PTX: 1400000 instructions simulated : ctaid=(9,0,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 5000  inst.: 1472608 (ipc=294.5) sim_rate=245434 (inst/sec) elapsed = 0:0:00:06 / Mon Jun 14 15:43:47 2021
GPGPU-Sim PTX: 1500000 instructions simulated : ctaid=(10,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 1600000 instructions simulated : ctaid=(4,5,0) tid=(7,2,0)
GPGPU-Sim PTX: 1700000 instructions simulated : ctaid=(10,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 1800000 instructions simulated : ctaid=(0,3,0) tid=(5,7,0)
GPGPU-Sim uArch: cycles simulated: 6000  inst.: 1825436 (ipc=304.2) sim_rate=260776 (inst/sec) elapsed = 0:0:00:07 / Mon Jun 14 15:43:48 2021
GPGPU-Sim PTX: 1900000 instructions simulated : ctaid=(1,1,0) tid=(7,6,0)
GPGPU-Sim PTX: 2000000 instructions simulated : ctaid=(14,4,0) tid=(9,1,0)
GPGPU-Sim PTX: 2100000 instructions simulated : ctaid=(2,5,0) tid=(1,7,0)
GPGPU-Sim PTX: 2200000 instructions simulated : ctaid=(5,2,0) tid=(9,7,0)
GPGPU-Sim uArch: cycles simulated: 7000  inst.: 2181760 (ipc=311.7) sim_rate=272720 (inst/sec) elapsed = 0:0:00:08 / Mon Jun 14 15:43:49 2021
GPGPU-Sim PTX: 2300000 instructions simulated : ctaid=(1,2,0) tid=(1,9,0)
GPGPU-Sim PTX: 2400000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 2500000 instructions simulated : ctaid=(14,2,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 8000  inst.: 2555524 (ipc=319.4) sim_rate=283947 (inst/sec) elapsed = 0:0:00:09 / Mon Jun 14 15:43:50 2021
GPGPU-Sim PTX: 2600000 instructions simulated : ctaid=(11,4,0) tid=(5,3,0)
GPGPU-Sim PTX: 2700000 instructions simulated : ctaid=(0,1,0) tid=(7,8,0)
GPGPU-Sim PTX: 2800000 instructions simulated : ctaid=(1,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 2900000 instructions simulated : ctaid=(6,6,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 9000  inst.: 2895792 (ipc=321.8) sim_rate=289579 (inst/sec) elapsed = 0:0:00:10 / Mon Jun 14 15:43:51 2021
GPGPU-Sim PTX: 3000000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim PTX: 3100000 instructions simulated : ctaid=(9,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 3200000 instructions simulated : ctaid=(5,1,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 10000  inst.: 3257748 (ipc=325.8) sim_rate=296158 (inst/sec) elapsed = 0:0:00:11 / Mon Jun 14 15:43:52 2021
GPGPU-Sim PTX: 3300000 instructions simulated : ctaid=(12,6,0) tid=(9,5,0)
GPGPU-Sim PTX: 3400000 instructions simulated : ctaid=(0,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 3500000 instructions simulated : ctaid=(14,7,0) tid=(3,8,0)
GPGPU-Sim PTX: 3600000 instructions simulated : ctaid=(8,0,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 11000  inst.: 3600988 (ipc=327.4) sim_rate=300082 (inst/sec) elapsed = 0:0:00:12 / Mon Jun 14 15:43:53 2021
GPGPU-Sim PTX: 3700000 instructions simulated : ctaid=(12,3,0) tid=(5,3,0)
GPGPU-Sim PTX: 3800000 instructions simulated : ctaid=(13,2,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 11500  inst.: 3804716 (ipc=330.8) sim_rate=292670 (inst/sec) elapsed = 0:0:00:13 / Mon Jun 14 15:43:54 2021
GPGPU-Sim PTX: 3900000 instructions simulated : ctaid=(4,7,0) tid=(9,1,0)
GPGPU-Sim PTX: 4000000 instructions simulated : ctaid=(4,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 4100000 instructions simulated : ctaid=(5,6,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 12500  inst.: 4159952 (ipc=332.8) sim_rate=297139 (inst/sec) elapsed = 0:0:00:14 / Mon Jun 14 15:43:55 2021
GPGPU-Sim PTX: 4200000 instructions simulated : ctaid=(3,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 4300000 instructions simulated : ctaid=(5,2,0) tid=(5,7,0)
GPGPU-Sim PTX: 4400000 instructions simulated : ctaid=(1,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 4500000 instructions simulated : ctaid=(4,3,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 13500  inst.: 4478000 (ipc=331.7) sim_rate=298533 (inst/sec) elapsed = 0:0:00:15 / Mon Jun 14 15:43:56 2021
GPGPU-Sim PTX: 4600000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 4700000 instructions simulated : ctaid=(12,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 4800000 instructions simulated : ctaid=(12,7,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 14500  inst.: 4845224 (ipc=334.2) sim_rate=302826 (inst/sec) elapsed = 0:0:00:16 / Mon Jun 14 15:43:57 2021
GPGPU-Sim PTX: 4900000 instructions simulated : ctaid=(14,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 5000000 instructions simulated : ctaid=(14,2,0) tid=(9,1,0)
GPGPU-Sim PTX: 5100000 instructions simulated : ctaid=(7,3,0) tid=(3,8,0)
GPGPU-Sim PTX: 5200000 instructions simulated : ctaid=(14,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 15500  inst.: 5198688 (ipc=335.4) sim_rate=305805 (inst/sec) elapsed = 0:0:00:17 / Mon Jun 14 15:43:58 2021
GPGPU-Sim PTX: 5300000 instructions simulated : ctaid=(9,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 5400000 instructions simulated : ctaid=(3,5,0) tid=(7,6,0)
GPGPU-Sim PTX: 5500000 instructions simulated : ctaid=(4,3,0) tid=(5,3,0)
GPGPU-Sim uArch: cycles simulated: 16500  inst.: 5561548 (ipc=337.1) sim_rate=308974 (inst/sec) elapsed = 0:0:00:18 / Mon Jun 14 15:43:59 2021
GPGPU-Sim PTX: 5600000 instructions simulated : ctaid=(5,1,0) tid=(9,9,0)
GPGPU-Sim PTX: 5700000 instructions simulated : ctaid=(8,0,0) tid=(3,4,0)
GPGPU-Sim PTX: 5800000 instructions simulated : ctaid=(3,1,0) tid=(1,3,0)
GPGPU-Sim PTX: 5900000 instructions simulated : ctaid=(0,7,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 17500  inst.: 5922888 (ipc=338.5) sim_rate=311730 (inst/sec) elapsed = 0:0:00:19 / Mon Jun 14 15:44:00 2021
GPGPU-Sim PTX: 6000000 instructions simulated : ctaid=(3,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 6100000 instructions simulated : ctaid=(2,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 6200000 instructions simulated : ctaid=(2,0,0) tid=(9,3,0)
GPGPU-Sim PTX: 6300000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 18500  inst.: 6276544 (ipc=339.3) sim_rate=313827 (inst/sec) elapsed = 0:0:00:20 / Mon Jun 14 15:44:01 2021
GPGPU-Sim PTX: 6400000 instructions simulated : ctaid=(10,0,0) tid=(1,7,0)
GPGPU-Sim PTX: 6500000 instructions simulated : ctaid=(13,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 6600000 instructions simulated : ctaid=(8,0,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 19500  inst.: 6644036 (ipc=340.7) sim_rate=316382 (inst/sec) elapsed = 0:0:00:21 / Mon Jun 14 15:44:02 2021
GPGPU-Sim PTX: 6700000 instructions simulated : ctaid=(8,6,0) tid=(5,1,0)
GPGPU-Sim PTX: 6800000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 20000  inst.: 6825636 (ipc=341.3) sim_rate=310256 (inst/sec) elapsed = 0:0:00:22 / Mon Jun 14 15:44:03 2021
GPGPU-Sim PTX: 6900000 instructions simulated : ctaid=(5,6,0) tid=(9,9,0)
GPGPU-Sim PTX: 7000000 instructions simulated : ctaid=(4,5,0) tid=(5,9,0)
GPGPU-Sim PTX: 7100000 instructions simulated : ctaid=(11,0,0) tid=(3,2,0)
GPGPU-Sim PTX: 7200000 instructions simulated : ctaid=(12,4,0) tid=(9,9,0)
GPGPU-Sim uArch: cycles simulated: 21000  inst.: 7175100 (ipc=341.7) sim_rate=311960 (inst/sec) elapsed = 0:0:00:23 / Mon Jun 14 15:44:04 2021
GPGPU-Sim PTX: 7300000 instructions simulated : ctaid=(12,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 7400000 instructions simulated : ctaid=(1,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 7500000 instructions simulated : ctaid=(11,6,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 22000  inst.: 7529316 (ipc=342.2) sim_rate=313721 (inst/sec) elapsed = 0:0:00:24 / Mon Jun 14 15:44:05 2021
GPGPU-Sim PTX: 7600000 instructions simulated : ctaid=(4,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 7700000 instructions simulated : ctaid=(11,7,0) tid=(5,9,0)
GPGPU-Sim PTX: 7800000 instructions simulated : ctaid=(11,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 7900000 instructions simulated : ctaid=(7,5,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 23000  inst.: 7901844 (ipc=343.6) sim_rate=316073 (inst/sec) elapsed = 0:0:00:25 / Mon Jun 14 15:44:06 2021
GPGPU-Sim PTX: 8000000 instructions simulated : ctaid=(8,7,0) tid=(1,5,0)
GPGPU-Sim PTX: 8100000 instructions simulated : ctaid=(4,0,0) tid=(3,6,0)
GPGPU-Sim PTX: 8200000 instructions simulated : ctaid=(6,3,0) tid=(7,0,0)
GPGPU-Sim uArch: cycles simulated: 24000  inst.: 8258968 (ipc=344.1) sim_rate=317652 (inst/sec) elapsed = 0:0:00:26 / Mon Jun 14 15:44:07 2021
GPGPU-Sim PTX: 8300000 instructions simulated : ctaid=(12,7,0) tid=(3,6,0)
GPGPU-Sim PTX: 8400000 instructions simulated : ctaid=(5,5,0) tid=(3,0,0)
GPGPU-Sim PTX: 8500000 instructions simulated : ctaid=(1,2,0) tid=(1,3,0)
GPGPU-Sim PTX: 8600000 instructions simulated : ctaid=(13,7,0) tid=(3,8,0)
GPGPU-Sim uArch: cycles simulated: 25000  inst.: 8616376 (ipc=344.7) sim_rate=319125 (inst/sec) elapsed = 0:0:00:27 / Mon Jun 14 15:44:08 2021
GPGPU-Sim PTX: 8700000 instructions simulated : ctaid=(3,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 8800000 instructions simulated : ctaid=(11,4,0) tid=(9,5,0)
GPGPU-Sim PTX: 8900000 instructions simulated : ctaid=(10,7,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 26000  inst.: 8970768 (ipc=345.0) sim_rate=320384 (inst/sec) elapsed = 0:0:00:28 / Mon Jun 14 15:44:09 2021
GPGPU-Sim PTX: 9000000 instructions simulated : ctaid=(9,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 9100000 instructions simulated : ctaid=(10,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 9200000 instructions simulated : ctaid=(4,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 9300000 instructions simulated : ctaid=(8,4,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 27000  inst.: 9350236 (ipc=346.3) sim_rate=322421 (inst/sec) elapsed = 0:0:00:29 / Mon Jun 14 15:44:10 2021
GPGPU-Sim PTX: 9400000 instructions simulated : ctaid=(6,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 9500000 instructions simulated : ctaid=(8,1,0) tid=(5,7,0)
GPGPU-Sim PTX: 9600000 instructions simulated : ctaid=(5,1,0) tid=(9,7,0)
GPGPU-Sim PTX: 9700000 instructions simulated : ctaid=(7,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 28000  inst.: 9687816 (ipc=346.0) sim_rate=322927 (inst/sec) elapsed = 0:0:00:30 / Mon Jun 14 15:44:11 2021
GPGPU-Sim PTX: 9800000 instructions simulated : ctaid=(1,2,0) tid=(1,1,0)
GPGPU-Sim PTX: 9900000 instructions simulated : ctaid=(1,7,0) tid=(7,8,0)
GPGPU-Sim PTX: 10000000 instructions simulated : ctaid=(1,2,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 29000  inst.: 10051864 (ipc=346.6) sim_rate=324253 (inst/sec) elapsed = 0:0:00:31 / Mon Jun 14 15:44:12 2021
GPGPU-Sim PTX: 10100000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 10200000 instructions simulated : ctaid=(3,6,0) tid=(9,1,0)
GPGPU-Sim PTX: 10300000 instructions simulated : ctaid=(2,1,0) tid=(3,4,0)
GPGPU-Sim PTX: 10400000 instructions simulated : ctaid=(11,2,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 30000  inst.: 10408808 (ipc=347.0) sim_rate=325275 (inst/sec) elapsed = 0:0:00:32 / Mon Jun 14 15:44:13 2021
GPGPU-Sim PTX: 10500000 instructions simulated : ctaid=(6,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 30500  inst.: 10571584 (ipc=346.6) sim_rate=320351 (inst/sec) elapsed = 0:0:00:33 / Mon Jun 14 15:44:14 2021
GPGPU-Sim PTX: 10600000 instructions simulated : ctaid=(12,4,0) tid=(3,0,0)
GPGPU-Sim PTX: 10700000 instructions simulated : ctaid=(0,4,0) tid=(9,7,0)
GPGPU-Sim PTX: 10800000 instructions simulated : ctaid=(14,4,0) tid=(7,6,0)
GPGPU-Sim PTX: 10900000 instructions simulated : ctaid=(7,6,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 31500  inst.: 10941032 (ipc=347.3) sim_rate=321795 (inst/sec) elapsed = 0:0:00:34 / Mon Jun 14 15:44:15 2021
GPGPU-Sim PTX: 11000000 instructions simulated : ctaid=(6,3,0) tid=(5,5,0)
GPGPU-Sim PTX: 11100000 instructions simulated : ctaid=(13,2,0) tid=(7,4,0)
GPGPU-Sim PTX: 11200000 instructions simulated : ctaid=(6,4,0) tid=(7,0,0)
GPGPU-Sim PTX: 11300000 instructions simulated : ctaid=(3,6,0) tid=(9,9,0)
GPGPU-Sim uArch: cycles simulated: 32500  inst.: 11300316 (ipc=347.7) sim_rate=322866 (inst/sec) elapsed = 0:0:00:35 / Mon Jun 14 15:44:16 2021
GPGPU-Sim PTX: 11400000 instructions simulated : ctaid=(13,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 11500000 instructions simulated : ctaid=(4,5,0) tid=(5,5,0)
GPGPU-Sim PTX: 11600000 instructions simulated : ctaid=(12,2,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 33500  inst.: 11664956 (ipc=348.2) sim_rate=324026 (inst/sec) elapsed = 0:0:00:36 / Mon Jun 14 15:44:17 2021
GPGPU-Sim PTX: 11700000 instructions simulated : ctaid=(12,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 11800000 instructions simulated : ctaid=(7,1,0) tid=(7,0,0)
GPGPU-Sim PTX: 11900000 instructions simulated : ctaid=(2,2,0) tid=(5,5,0)
GPGPU-Sim PTX: 12000000 instructions simulated : ctaid=(8,6,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 34500  inst.: 12026356 (ipc=348.6) sim_rate=325036 (inst/sec) elapsed = 0:0:00:37 / Mon Jun 14 15:44:18 2021
GPGPU-Sim PTX: 12100000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 12200000 instructions simulated : ctaid=(11,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 12300000 instructions simulated : ctaid=(2,5,0) tid=(1,5,0)
GPGPU-Sim PTX: 12400000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 35500  inst.: 12395792 (ipc=349.2) sim_rate=326205 (inst/sec) elapsed = 0:0:00:38 / Mon Jun 14 15:44:19 2021
GPGPU-Sim PTX: 12500000 instructions simulated : ctaid=(2,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 12600000 instructions simulated : ctaid=(0,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 12700000 instructions simulated : ctaid=(6,3,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 36500  inst.: 12743812 (ipc=349.1) sim_rate=326764 (inst/sec) elapsed = 0:0:00:39 / Mon Jun 14 15:44:20 2021
GPGPU-Sim PTX: 12800000 instructions simulated : ctaid=(5,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 12900000 instructions simulated : ctaid=(12,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 13000000 instructions simulated : ctaid=(14,2,0) tid=(1,5,0)
GPGPU-Sim PTX: 13100000 instructions simulated : ctaid=(11,5,0) tid=(1,3,0)
GPGPU-Sim uArch: cycles simulated: 37500  inst.: 13091280 (ipc=349.1) sim_rate=327282 (inst/sec) elapsed = 0:0:00:40 / Mon Jun 14 15:44:21 2021
GPGPU-Sim PTX: 13200000 instructions simulated : ctaid=(8,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 13300000 instructions simulated : ctaid=(1,6,0) tid=(7,0,0)
GPGPU-Sim PTX: 13400000 instructions simulated : ctaid=(1,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 38500  inst.: 13449032 (ipc=349.3) sim_rate=328025 (inst/sec) elapsed = 0:0:00:41 / Mon Jun 14 15:44:22 2021
GPGPU-Sim PTX: 13500000 instructions simulated : ctaid=(3,0,0) tid=(1,3,0)
GPGPU-Sim PTX: 13600000 instructions simulated : ctaid=(7,5,0) tid=(3,4,0)
GPGPU-Sim PTX: 13700000 instructions simulated : ctaid=(3,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 13800000 instructions simulated : ctaid=(2,1,0) tid=(1,3,0)
GPGPU-Sim uArch: cycles simulated: 39500  inst.: 13803876 (ipc=349.5) sim_rate=328663 (inst/sec) elapsed = 0:0:00:42 / Mon Jun 14 15:44:23 2021
GPGPU-Sim PTX: 13900000 instructions simulated : ctaid=(1,2,0) tid=(3,6,0)
GPGPU-Sim PTX: 14000000 instructions simulated : ctaid=(1,4,0) tid=(9,1,0)
GPGPU-Sim PTX: 14100000 instructions simulated : ctaid=(0,2,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 40500  inst.: 14167076 (ipc=349.8) sim_rate=329466 (inst/sec) elapsed = 0:0:00:43 / Mon Jun 14 15:44:24 2021
GPGPU-Sim PTX: 14200000 instructions simulated : ctaid=(9,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 14300000 instructions simulated : ctaid=(4,6,0) tid=(3,8,0)
GPGPU-Sim PTX: 14400000 instructions simulated : ctaid=(4,5,0) tid=(9,7,0)
GPGPU-Sim PTX: 14500000 instructions simulated : ctaid=(8,6,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 41500  inst.: 14512928 (ipc=349.7) sim_rate=329839 (inst/sec) elapsed = 0:0:00:44 / Mon Jun 14 15:44:25 2021
GPGPU-Sim PTX: 14600000 instructions simulated : ctaid=(4,3,0) tid=(3,2,0)
GPGPU-Sim PTX: 14700000 instructions simulated : ctaid=(10,5,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 42000  inst.: 14687016 (ipc=349.7) sim_rate=326378 (inst/sec) elapsed = 0:0:00:45 / Mon Jun 14 15:44:26 2021
GPGPU-Sim PTX: 14800000 instructions simulated : ctaid=(11,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 14900000 instructions simulated : ctaid=(7,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 15000000 instructions simulated : ctaid=(14,6,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 43000  inst.: 15056520 (ipc=350.2) sim_rate=327315 (inst/sec) elapsed = 0:0:00:46 / Mon Jun 14 15:44:27 2021
GPGPU-Sim PTX: 15100000 instructions simulated : ctaid=(9,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 15200000 instructions simulated : ctaid=(3,2,0) tid=(5,5,0)
GPGPU-Sim PTX: 15300000 instructions simulated : ctaid=(14,1,0) tid=(5,5,0)
GPGPU-Sim PTX: 15400000 instructions simulated : ctaid=(0,2,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 44000  inst.: 15410888 (ipc=350.2) sim_rate=327891 (inst/sec) elapsed = 0:0:00:47 / Mon Jun 14 15:44:28 2021
GPGPU-Sim PTX: 15500000 instructions simulated : ctaid=(0,0,0) tid=(7,8,0)
GPGPU-Sim PTX: 15600000 instructions simulated : ctaid=(1,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 15700000 instructions simulated : ctaid=(6,3,0) tid=(3,6,0)
GPGPU-Sim uArch: cycles simulated: 45000  inst.: 15772716 (ipc=350.5) sim_rate=328598 (inst/sec) elapsed = 0:0:00:48 / Mon Jun 14 15:44:29 2021
GPGPU-Sim PTX: 15800000 instructions simulated : ctaid=(4,7,0) tid=(3,4,0)
GPGPU-Sim PTX: 15900000 instructions simulated : ctaid=(0,2,0) tid=(9,7,0)
GPGPU-Sim PTX: 16000000 instructions simulated : ctaid=(9,5,0) tid=(9,1,0)
GPGPU-Sim PTX: 16100000 instructions simulated : ctaid=(11,5,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 46000  inst.: 16147168 (ipc=351.0) sim_rate=329534 (inst/sec) elapsed = 0:0:00:49 / Mon Jun 14 15:44:30 2021
GPGPU-Sim PTX: 16200000 instructions simulated : ctaid=(6,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 16300000 instructions simulated : ctaid=(4,4,0) tid=(1,1,0)
GPGPU-Sim PTX: 16400000 instructions simulated : ctaid=(5,1,0) tid=(3,2,0)
GPGPU-Sim PTX: 16500000 instructions simulated : ctaid=(4,6,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 47000  inst.: 16499776 (ipc=351.1) sim_rate=329995 (inst/sec) elapsed = 0:0:00:50 / Mon Jun 14 15:44:31 2021
GPGPU-Sim PTX: 16600000 instructions simulated : ctaid=(10,2,0) tid=(5,1,0)
GPGPU-Sim uArch: Shader 16 finished CTA #0 (47265,0), 1 CTAs running
GPGPU-Sim uArch: Shader 29 finished CTA #0 (47414,0), 1 CTAs running
GPGPU-Sim uArch: Shader 5 finished CTA #0 (47423,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #0 (47436,0), 1 CTAs running
GPGPU-Sim uArch: Shader 51 finished CTA #0 (47497,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #0 (47499,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #0 (47501,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #0 (47559,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #1 (47584,0), 0 CTAs running
GPGPU-Sim uArch: Shader 14 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 39 finished CTA #0 (47615,0), 1 CTAs running
GPGPU-Sim uArch: Shader 16 finished CTA #1 (47620,0), 0 CTAs running
GPGPU-Sim uArch: Shader 16 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #0 (47632,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #0 (47633,0), 1 CTAs running
GPGPU-Sim uArch: Shader 7 finished CTA #0 (47636,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #1 (47654,0), 0 CTAs running
GPGPU-Sim uArch: Shader 17 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 10 finished CTA #0 (47661,0), 1 CTAs running
GPGPU-Sim uArch: Shader 34 finished CTA #1 (47663,0), 1 CTAs running
GPGPU-Sim uArch: Shader 53 finished CTA #0 (47671,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #0 (47690,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #1 (47725,0), 0 CTAs running
GPGPU-Sim uArch: Shader 18 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 27 finished CTA #0 (47732,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #0 (47734,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #0 (47754,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #0 (47758,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #0 (47760,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #1 (47767,0), 0 CTAs running
GPGPU-Sim uArch: Shader 12 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 5 finished CTA #1 (47768,0), 0 CTAs running
GPGPU-Sim uArch: Shader 5 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 32 finished CTA #0 (47776,0), 1 CTAs running
GPGPU-Sim uArch: Shader 27 finished CTA #1 (47790,0), 0 CTAs running
GPGPU-Sim uArch: Shader 27 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #0 (47794,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #0 (47796,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #1 (47798,0), 0 CTAs running
GPGPU-Sim uArch: Shader 8 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #0 (47811,0), 1 CTAs running
GPGPU-Sim uArch: Shader 25 finished CTA #0 (47812,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #1 (47813,0), 0 CTAs running
GPGPU-Sim uArch: Shader 3 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 13 finished CTA #1 (47818,0), 1 CTAs running
GPGPU-Sim uArch: Shader 13 finished CTA #0 (47838,0), 0 CTAs running
GPGPU-Sim uArch: Shader 13 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 44 finished CTA #0 (47844,0), 1 CTAs running
GPGPU-Sim uArch: Shader 32 finished CTA #1 (47848,0), 0 CTAs running
GPGPU-Sim uArch: Shader 32 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 50 finished CTA #1 (47862,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #1 (47863,0), 0 CTAs running
GPGPU-Sim uArch: Shader 23 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 53 finished CTA #1 (47865,0), 0 CTAs running
GPGPU-Sim uArch: Shader 53 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 7 finished CTA #1 (47867,0), 0 CTAs running
GPGPU-Sim uArch: Shader 7 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #1 (47875,0), 1 CTAs running
GPGPU-Sim uArch: Shader 52 finished CTA #1 (47884,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 19 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 40 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 6 finished CTA #0 (47888,0), 1 CTAs running
GPGPU-Sim uArch: Shader 57 finished CTA #0 (47892,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #1 (47904,0), 0 CTAs running
GPGPU-Sim uArch: Shader 55 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 52 finished CTA #0 (47906,0), 0 CTAs running
GPGPU-Sim uArch: Shader 52 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #0 (47909,0), 0 CTAs running
GPGPU-Sim uArch: Shader 20 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 0 finished CTA #0 (47914,0), 1 CTAs running
GPGPU-Sim uArch: Shader 10 finished CTA #1 (47915,0), 0 CTAs running
GPGPU-Sim uArch: Shader 10 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 29 finished CTA #1 (47919,0), 0 CTAs running
GPGPU-Sim uArch: Shader 29 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 11 finished CTA #0 (47938,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #1 (47963,0), 0 CTAs running
GPGPU-Sim uArch: Shader 45 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #1 (47964,0), 1 CTAs running
GPGPU-Sim uArch: Shader 11 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 11 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 57 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 57 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #0 (47967,0), 1 CTAs running
GPGPU-Sim uArch: Shader 43 finished CTA #0 (47975,0), 1 CTAs running
GPGPU-Sim uArch: Shader 47 finished CTA #0 (47978,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #0 (47985,0), 1 CTAs running
GPGPU-Sim uArch: Shader 36 finished CTA #0 (47987,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #1 (47988,0), 0 CTAs running
GPGPU-Sim uArch: Shader 4 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #1 (47991,0), 0 CTAs running
GPGPU-Sim uArch: Shader 9 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #0 (47996,0), 1 CTAs running
GPGPU-Sim uArch: cycles simulated: 48000  inst.: 16631860 (ipc=346.5) sim_rate=326114 (inst/sec) elapsed = 0:0:00:51 / Mon Jun 14 15:44:32 2021
GPGPU-Sim uArch: Shader 21 finished CTA #1 (48000,0), 0 CTAs running
GPGPU-Sim uArch: Shader 21 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 51 finished CTA #1 (48001,0), 0 CTAs running
GPGPU-Sim uArch: Shader 51 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #0 (48008,0), 0 CTAs running
GPGPU-Sim uArch: Shader 15 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #0 (48010,0), 1 CTAs running
GPGPU-Sim uArch: Shader 30 finished CTA #0 (48018,0), 1 CTAs running
GPGPU-Sim uArch: Shader 0 finished CTA #1 (48019,0), 0 CTAs running
GPGPU-Sim uArch: Shader 0 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 54 finished CTA #0 (48026,0), 1 CTAs running
GPGPU-Sim uArch: Shader 6 finished CTA #1 (48028,0), 0 CTAs running
GPGPU-Sim uArch: Shader 6 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #0 (48031,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #0 (48042,0), 1 CTAs running
GPGPU-Sim uArch: Shader 54 finished CTA #1 (48056,0), 0 CTAs running
GPGPU-Sim uArch: Shader 54 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #1 (48059,0), 0 CTAs running
GPGPU-Sim uArch: Shader 24 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #1 (48061,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #1 (48063,0), 0 CTAs running
GPGPU-Sim uArch: Shader 48 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 30 finished CTA #1 (48070,0), 0 CTAs running
GPGPU-Sim uArch: Shader 30 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 25 finished CTA #1 (48079,0), 0 CTAs running
GPGPU-Sim uArch: Shader 25 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #1 (48081,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #1 (48083,0), 0 CTAs running
GPGPU-Sim uArch: Shader 59 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #0 (48084,0), 1 CTAs running
GPGPU-Sim uArch: Shader 49 finished CTA #1 (48086,0), 1 CTAs running
GPGPU-Sim uArch: Shader 39 finished CTA #1 (48096,0), 0 CTAs running
GPGPU-Sim uArch: Shader 39 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #0 (48098,0), 0 CTAs running
GPGPU-Sim uArch: Shader 2 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 49 finished CTA #0 (48102,0), 0 CTAs running
GPGPU-Sim uArch: Shader 49 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 28 finished CTA #0 (48104,0), 1 CTAs running
GPGPU-Sim uArch: Shader 28 finished CTA #1 (48107,0), 0 CTAs running
GPGPU-Sim uArch: Shader 28 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #0 (48111,0), 1 CTAs running
GPGPU-Sim uArch: Shader 50 finished CTA #0 (48111,0), 0 CTAs running
GPGPU-Sim uArch: Shader 50 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #1 (48114,0), 0 CTAs running
GPGPU-Sim uArch: Shader 22 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #1 (48132,0), 0 CTAs running
GPGPU-Sim uArch: Shader 58 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 47 finished CTA #1 (48143,0), 0 CTAs running
GPGPU-Sim uArch: Shader 47 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #0 (48153,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #1 (48153,0), 0 CTAs running
GPGPU-Sim uArch: Shader 35 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 36 finished CTA #1 (48154,0), 0 CTAs running
GPGPU-Sim uArch: Shader 36 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #1 (48160,0), 0 CTAs running
GPGPU-Sim uArch: Shader 31 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 26 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 33 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #0 (48165,0), 0 CTAs running
GPGPU-Sim uArch: Shader 38 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 34 finished CTA #0 (48177,0), 0 CTAs running
GPGPU-Sim uArch: Shader 34 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #1 (48188,0), 0 CTAs running
GPGPU-Sim uArch: Shader 56 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #1 (48189,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #0 (48192,0), 1 CTAs running
GPGPU-Sim uArch: Shader 37 finished CTA #1 (48202,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #1 (48212,0), 0 CTAs running
GPGPU-Sim uArch: Shader 1 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #0 (48216,0), 0 CTAs running
GPGPU-Sim uArch: Shader 41 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 37 finished CTA #0 (48218,0), 0 CTAs running
GPGPU-Sim uArch: Shader 37 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #1 (48221,0), 0 CTAs running
GPGPU-Sim uArch: Shader 46 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #0 (48226,0), 1 CTAs running
GPGPU-Sim uArch: Shader 44 finished CTA #1 (48233,0), 0 CTAs running
GPGPU-Sim uArch: Shader 44 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #1 (48248,0), 0 CTAs running
GPGPU-Sim uArch: Shader 42 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 43 finished CTA #1 (48281,0), 0 CTAs running
GPGPU-Sim uArch: Shader 43 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: GPU detected kernel '_Z14matrix_mul_gpuPiS_S_i' finished on shader 43.
kernel_name = _Z14matrix_mul_gpuPiS_S_i 
kernel_launch_uid = 1 
gpu_sim_cycle = 48282
gpu_sim_insn = 16632000
gpu_ipc =     344.4762
gpu_tot_sim_cycle = 48282
gpu_tot_sim_insn = 16632000
gpu_tot_ipc =     344.4762
gpu_tot_issued_cta = 0
gpu_stall_dramfull = 42547
gpu_stall_icnt2sh    = 68778
gpu_total_sim_rate=326117

========= Core cache stats =========
L1I_cache:
	L1I_total_cache_accesses = 371520
	L1I_total_cache_misses = 1920
	L1I_total_cache_miss_rate = 0.0052
	L1I_total_cache_pending_hits = 0
	L1I_total_cache_reservation_fails = 0
L1D_cache:
	L1D_cache_core[0]: Access = 21513, Miss = 1284, Miss_rate = 0.060, Pending_hits = 5147, Reservation_fails = 2614
	L1D_cache_core[1]: Access = 21489, Miss = 1278, Miss_rate = 0.059, Pending_hits = 5140, Reservation_fails = 1796
	L1D_cache_core[2]: Access = 21492, Miss = 1281, Miss_rate = 0.060, Pending_hits = 5151, Reservation_fails = 2084
	L1D_cache_core[3]: Access = 21457, Miss = 1268, Miss_rate = 0.059, Pending_hits = 5125, Reservation_fails = 1525
	L1D_cache_core[4]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5144, Reservation_fails = 928
	L1D_cache_core[5]: Access = 21481, Miss = 1272, Miss_rate = 0.059, Pending_hits = 5142, Reservation_fails = 2045
	L1D_cache_core[6]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5143, Reservation_fails = 2898
	L1D_cache_core[7]: Access = 21505, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5158, Reservation_fails = 3324
	L1D_cache_core[8]: Access = 21508, Miss = 1279, Miss_rate = 0.059, Pending_hits = 5149, Reservation_fails = 2750
	L1D_cache_core[9]: Access = 21505, Miss = 1287, Miss_rate = 0.060, Pending_hits = 5157, Reservation_fails = 3313
	L1D_cache_core[10]: Access = 21508, Miss = 1293, Miss_rate = 0.060, Pending_hits = 5164, Reservation_fails = 3276
	L1D_cache_core[11]: Access = 21505, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5148, Reservation_fails = 2931
	L1D_cache_core[12]: Access = 21508, Miss = 1292, Miss_rate = 0.060, Pending_hits = 5150, Reservation_fails = 2987
	L1D_cache_core[13]: Access = 21513, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5145, Reservation_fails = 2983
	L1D_cache_core[14]: Access = 21516, Miss = 1285, Miss_rate = 0.060, Pending_hits = 5136, Reservation_fails = 3154
	L1D_total_cache_accesses = 322468
	L1D_total_cache_misses = 19228
	L1D_total_cache_miss_rate = 0.0596
	L1D_total_cache_pending_hits = 77199
	L1D_total_cache_reservation_fails = 38608
	L1D_cache_data_port_util = 0.078
	L1D_cache_fill_port_util = 0.006
L1C_cache:
	L1C_total_cache_accesses = 1920
	L1C_total_cache_misses = 480
	L1C_total_cache_miss_rate = 0.2500
	L1C_total_cache_pending_hits = 0
	L1C_total_cache_reservation_fails = 0
L1T_cache:
	L1T_total_cache_accesses = 0
	L1T_total_cache_misses = 0
	L1T_total_cache_pending_hits = 0
	L1T_total_cache_reservation_fails = 0

Total_core_cache_stats:
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 225461
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 77199
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 17972
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 13408
	Total_core_cache_stats_breakdown[CONST_ACC_R][HIT] = 1440
	Total_core_cache_stats_breakdown[CONST_ACC_R][MISS] = 480
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 580
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 1256
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][RESERVATION_FAIL] = 25200
	Total_core_cache_stats_breakdown[INST_ACC_R][HIT] = 369600
	Total_core_cache_stats_breakdown[INST_ACC_R][MISS] = 1920
Shader 0 warp_id issue ditsribution:
warp_id:
0, 1, 2, 3, 4, 5, 6, 7, 
distro:
1388, 1388, 1388, 1388, 1388, 1388, 1388, 1388, 
gpgpu_n_tot_thrd_icount = 21319680
gpgpu_n_tot_w_icount = 666240
gpgpu_n_stall_shd_mem = 216596
gpgpu_n_mem_read_local = 0
gpgpu_n_mem_write_local = 0
gpgpu_n_mem_read_global = 17972
gpgpu_n_mem_write_global = 1836
gpgpu_n_mem_texture = 0
gpgpu_n_mem_const = 60
gpgpu_n_load_insn  = 3600000
gpgpu_n_store_insn = 12000
gpgpu_n_shmem_insn = 0
gpgpu_n_tex_insn = 0
gpgpu_n_const_mem_insn = 0
gpgpu_n_param_mem_insn = 48000
gpgpu_n_shmem_bkconflict = 0
gpgpu_n_cache_bkconflict = 0
gpgpu_n_intrawarp_mshr_merge = 0
gpgpu_n_cmem_portconflict = 0
gpgpu_stall_shd_mem[c_mem][bk_conf] = 0
gpgpu_stall_shd_mem[c_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[c_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[c_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[t_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[t_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[t_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[s_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][coal_stall] = 216596
gpgpu_stall_shd_mem[gl_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[g_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[g_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpu_reg_bank_conflict_stalls = 0
Warp Occupancy Distribution:
Stall:169500	W0_Idle:293067	W0_Scoreboard:4645929	W1:0	W2:0	W3:0	W4:166560	W5:0	W6:0	W7:0	W8:0	W9:0	W10:0	W11:0	W12:0	W13:0	W14:0	W15:0	W16:0	W17:0	W18:0	W19:0	W20:0	W21:0	W22:0	W23:0	W24:0	W25:0	W26:0	W27:0	W28:0	W29:0	W30:0	W31:0	W32:499680
traffic_breakdown_coretomem[CONST_ACC_R] = 480 {8:60,}
traffic_breakdown_coretomem[GLOBAL_ACC_R] = 143776 {8:17972,}
traffic_breakdown_coretomem[GLOBAL_ACC_W] = 117600 {40:1008,72:552,136:276,}
traffic_breakdown_coretomem[INST_ACC_R] = 1920 {8:240,}
traffic_breakdown_memtocore[CONST_ACC_R] = 4320 {72:60,}
traffic_breakdown_memtocore[GLOBAL_ACC_R] = 2444192 {136:17972,}
traffic_breakdown_memtocore[GLOBAL_ACC_W] = 14688 {8:1836,}
traffic_breakdown_memtocore[INST_ACC_R] = 32640 {136:240,}
maxmrqlatency = 12 
maxdqlatency = 0 
maxmflatency = 1356 
averagemflatency = 264 
max_icnt2mem_latency = 1205 
max_icnt2sh_latency = 48281 
mrq_lat_table:1080 	32 	4 	10 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
dq_lat_table:0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_table:0 	0 	0 	0 	0 	0 	0 	10720 	8178 	899 	71 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2mem_lat_table:0 	0 	0 	15440 	432 	764 	1403 	1061 	742 	253 	13 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2sh_lat_table:0 	0 	0 	3446 	13655 	892 	39 	0 	0 	0 	0 	0 	0 	0 	0 	1836 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_pw_table:0 	0 	0 	0 	0 	0 	0 	87 	6 	2 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
maximum concurrent accesses to same row:
dram[0]:         1         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
maximum service time to same row:
dram[0]:       502         0         0         0      1650      1707      1653      1501     12085     12831     36822     37578         0         0         0         0 
dram[1]:      1485         0         0         0      1763      1494      1765      1521     12204     13041     36903     37765         0         0         0         0 
dram[2]:         0         0         0         0      1503      1510      1526      1516     12325     13141     37102     37849         0         0         0         0 
dram[3]:         0         0         0         0      1499      1588      1525      1678     12515     13263     37144     38024         0         0         0         0 
dram[4]:         0         0         0         0      1616      1493      1484      1515     12585     13452     37341     38075         0         0         0         0 
dram[5]:         0         0         0         0      1497      1606      1519      1688     12779     13531     37456     38221         0         0         0         0 
average row accesses per activate:
dram[0]:  1.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 21.000000 19.000000      -nan      -nan      -nan      -nan 
dram[1]:  2.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 19.000000 16.000000      -nan      -nan      -nan      -nan 
dram[2]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 18.000000      -nan      -nan      -nan      -nan 
dram[3]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 17.000000      -nan      -nan      -nan      -nan 
dram[4]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 18.000000 21.000000      -nan      -nan      -nan      -nan 
dram[5]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 22.000000 16.000000      -nan      -nan      -nan      -nan 
average row locality = 1126/52 = 21.653847
number of total memory accesses made:
dram[0]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
total accesses: 0
min_bank_accesses = 0!
min_chip_accesses = 0!
number of total read accesses:
dram[0]:         3         0         0         0        10        10        32        32        32        32        16        16         0         0         0         0 
dram[1]:         2         0         0         0        10        10        32        32        32        32        16        15         0         0         0         0 
dram[2]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[3]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[4]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[5]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
total reads: 1084
min_bank_accesses = 0!
chip skew: 183/180 = 1.02
number of total write accesses:
dram[0]:         0         0         0         0         0         0         0         0         0         0         5         3         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         3         1         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         3         4         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         3         3         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         2         7         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         6         2         0         0         0         0 
total reads: 42
min_bank_accesses = 0!
chip skew: 9/4 = 2.25
average mf latency per bank:
dram[0]:       8505    none      none      none        6219      4879      4705      5583      4863      4800      4214      4411    none      none      none      none  
dram[1]:          0    none      none      none        4030      5132      5447      4339      4812      4857      4258      5293    none      none      none      none  
dram[2]:     none      none      none      none        4973      3789      4482      4576      4719      4883      4095      3891    none      none      none      none  
dram[3]:     none      none      none      none        3574      4782      4611      4852      4883      5030      4219      3979    none      none      none      none  
dram[4]:     none      none      none      none        6249      4014      5355      4358      4534      4853      4952      3543    none      none      none      none  
dram[5]:     none      none      none      none        4108      5204      4843      5156      4601      4623      3826      4504    none      none      none      none  
maximum mf latency per bank:
dram[0]:        486         0         0         0      1110      1069      1107      1325       592       781       876       721         0         0         0         0
dram[1]:          0         0         0         0      1097       809      1356       876       801       646       697       706         0         0         0         0
dram[2]:          0         0         0         0       882       679       907      1071       716       771       873       759         0         0         0         0
dram[3]:          0         0         0         0       377       983      1018      1068       742       781       690       817         0         0         0         0
dram[4]:          0         0         0         0      1081       527      1180       723       799       631       718       700         0         0         0         0
dram[5]:          0         0         0         0       543      1153      1006      1168       728       775       873       742         0         0         0         0

Number of Memory Banks Accessed per Memory Operation per Warp (from 0):
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
Average # of Memory Banks Accessed per Memory Operation per Warp=-nan

position of mrq chosen
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	

average position of mrq chosen = -nan
Memory Partition 0: 
Cache L2_bank_000:
MSHR contents

Cache L2_bank_001:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[0]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63344 n_act=11 n_pre=2 n_req=191 n_rd=366 n_write=8 bw_util=0.01174
n_activity=2903 dram_eff=0.2577
bk0: 6a 63659i bk1: 0a 63730i bk2: 0a 63734i bk3: 0a 63734i bk4: 20a 63671i bk5: 20a 63680i bk6: 64a 63581i bk7: 64a 63588i bk8: 64a 63586i bk9: 64a 63580i bk10: 32a 63625i bk11: 32a 63618i bk12: 0a 63727i bk13: 0a 63727i bk14: 0a 63730i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000784548
Memory Partition 1: 
Cache L2_bank_002:
MSHR contents

Cache L2_bank_003:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[1]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=9 n_pre=0 n_req=185 n_rd=362 n_write=5 bw_util=0.01152
n_activity=2703 dram_eff=0.2716
bk0: 4a 63711i bk1: 0a 63732i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 20a 63676i bk6: 64a 63580i bk7: 64a 63580i bk8: 64a 63587i bk9: 64a 63581i bk10: 32a 63624i bk11: 30a 63641i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63732i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000266746
Memory Partition 2: 
Cache L2_bank_004:
MSHR contents

Cache L2_bank_005:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[2]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=187 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2765 dram_eff=0.2662
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63679i bk5: 24a 63671i bk6: 64a 63581i bk7: 64a 63590i bk8: 64a 63588i bk9: 64a 63587i bk10: 32a 63623i bk11: 28a 63638i bk12: 0a 63728i bk13: 0a 63730i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000235364
Memory Partition 3: 
Cache L2_bank_006:
MSHR contents

Cache L2_bank_007:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[3]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63357 n_act=8 n_pre=0 n_req=186 n_rd=360 n_write=6 bw_util=0.01149
n_activity=2830 dram_eff=0.2587
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63680i bk5: 24a 63670i bk6: 64a 63580i bk7: 64a 63572i bk8: 64a 63586i bk9: 64a 63589i bk10: 32a 63621i bk11: 28a 63638i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000580565
Memory Partition 4: 
Cache L2_bank_008:
MSHR contents

Cache L2_bank_009:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[4]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63353 n_act=8 n_pre=0 n_req=189 n_rd=360 n_write=10 bw_util=0.01161
n_activity=2771 dram_eff=0.2671
bk0: 0a 63731i bk1: 0a 63733i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 24a 63670i bk6: 64a 63587i bk7: 64a 63587i bk8: 64a 63588i bk9: 64a 63581i bk10: 32a 63633i bk11: 28a 63627i bk12: 0a 63728i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000313819
Memory Partition 5: 
Cache L2_bank_010:
MSHR contents

Cache L2_bank_011:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[5]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=188 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2789 dram_eff=0.2639
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63733i bk4: 20a 63678i bk5: 24a 63668i bk6: 64a 63584i bk7: 64a 63582i bk8: 64a 63589i bk9: 64a 63588i bk10: 32a 63598i bk11: 28a 63623i bk12: 0a 63729i bk13: 0a 63730i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.00105129

========= L2 cache stats =========
L2_cache_bank[0]: Access = 1826, Miss = 93, Miss_rate = 0.051, Pending_hits = 248, Reservation_fails = 4441
L2_cache_bank[1]: Access = 1684, Miss = 90, Miss_rate = 0.053, Pending_hits = 231, Reservation_fails = 3387
L2_cache_bank[2]: Access = 1777, Miss = 92, Miss_rate = 0.052, Pending_hits = 239, Reservation_fails = 3478
L2_cache_bank[3]: Access = 1652, Miss = 89, Miss_rate = 0.054, Pending_hits = 227, Reservation_fails = 3558
L2_cache_bank[4]: Access = 1642, Miss = 90, Miss_rate = 0.055, Pending_hits = 236, Reservation_fails = 3430
L2_cache_bank[5]: Access = 1661, Miss = 90, Miss_rate = 0.054, Pending_hits = 232, Reservation_fails = 3472
L2_cache_bank[6]: Access = 1637, Miss = 90, Miss_rate = 0.055, Pending_hits = 237, Reservation_fails = 3884
L2_cache_bank[7]: Access = 1639, Miss = 90, Miss_rate = 0.055, Pending_hits = 250, Reservation_fails = 4069
L2_cache_bank[8]: Access = 1656, Miss = 90, Miss_rate = 0.054, Pending_hits = 250, Reservation_fails = 3783
L2_cache_bank[9]: Access = 1643, Miss = 90, Miss_rate = 0.055, Pending_hits = 241, Reservation_fails = 3742
L2_cache_bank[10]: Access = 1641, Miss = 90, Miss_rate = 0.055, Pending_hits = 239, Reservation_fails = 3801
L2_cache_bank[11]: Access = 1650, Miss = 90, Miss_rate = 0.055, Pending_hits = 243, Reservation_fails = 3849
L2_total_cache_accesses = 20108
L2_total_cache_misses = 1084
L2_total_cache_miss_rate = 0.0539
L2_total_cache_pending_hits = 2873
L2_total_cache_reservation_fails = 44894
L2_total_cache_breakdown:
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 14077
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 2839
	L2_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 1056
	L2_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 44379
	L2_cache_stats_breakdown[CONST_ACC_R][HIT] = 56
	L2_cache_stats_breakdown[CONST_ACC_R][HIT_RESERVED] = 3
	L2_cache_stats_breakdown[CONST_ACC_R][MISS] = 1
	L2_cache_stats_breakdown[CONST_ACC_R][RESERVATION_FAIL] = 129
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 1794
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT_RESERVED] = 19
	L2_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 23
	L2_cache_stats_breakdown[INST_ACC_R][HIT] = 224
	L2_cache_stats_breakdown[INST_ACC_R][HIT_RESERVED] = 12
	L2_cache_stats_breakdown[INST_ACC_R][MISS] = 4
	L2_cache_stats_breakdown[INST_ACC_R][RESERVATION_FAIL] = 386
L2_cache_data_port_util = 0.104
L2_cache_fill_port_util = 0.007

icnt_total_pkts_mem_to_simt=93076
icnt_total_pkts_simt_to_mem=23324
LD_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ST_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
----------------------------Interconnect-DETAILS--------------------------------
Class 0:
Packet latency average = 18.5053
	minimum = 6
	maximum = 729
Network latency average = 13.7655
	minimum = 6
	maximum = 426
Slowest packet = 623
Flit latency average = 11.0758
	minimum = 6
	maximum = 426
Slowest flit = 1683
Fragmentation average = 0
	minimum = 0
	maximum = 0
Injected packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Accepted packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Injected flit rate average = 0.0892902
	minimum = 0.0318131 (at node 3)
	maximum = 0.173936 (at node 15)
Accepted flit rate average= 0.0892902
	minimum = 0.0392072 (at node 24)
	maximum = 0.129406 (at node 9)
Injected packet length average = 2.89437
Accepted packet length average = 2.89437
Total in-flight flits = 0 (0 measured)
====== Overall Traffic Statistics ======
====== Traffic class 0 ======
Packet latency average = 18.5053 (1 samples)
	minimum = 6 (1 samples)
	maximum = 729 (1 samples)
Network latency average = 13.7655 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Flit latency average = 11.0758 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Fragmentation average = 0 (1 samples)
	minimum = 0 (1 samples)
	maximum = 0 (1 samples)
Injected packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Accepted packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Injected flit rate average = 0.0892902 (1 samples)
	minimum = 0.0318131 (1 samples)
	maximum = 0.173936 (1 samples)
Accepted flit rate average = 0.0892902 (1 samples)
	minimum = 0.0392072 (1 samples)
	maximum = 0.129406 (1 samples)
Injected packet size average = 2.89437 (1 samples)
Accepted packet size average = 2.89437 (1 samples)
Hops average = 1 (1 samples)
----------------------------END-of-Interconnect-DETAILS-------------------------


gpgpu_simulation_time = 0 days, 0 hrs, 0 min, 51 sec (51 sec)
gpgpu_simulation_rate = 326117 (inst/sec)
gpgpu_simulation_rate = 946 (cycle/sec)
total time is 50181 ms


        *** GPGPU-Sim Simulator Version 3.2.2  [build 0] ***


GPGPU-Sim PTX: simulation mode 0 (can change with PTX_SIM_MODE_FUNC environment variable:
               1=functional simulation only, 0=detailed performance simulator)
GPGPU-Sim: Configuration options:

-network_mode                           1 # Interconnection network mode
-inter_config_file   config_fermi_islip.icnt # Interconnection network config file
-gpgpu_ptx_use_cuobjdump                    1 # Use cuobjdump to extract ptx and sass from binaries
-gpgpu_experimental_lib_support                    0 # Try to extract code from cuda libraries [Broken because of unknown cudaGetExportTable]
-gpgpu_ptx_convert_to_ptxplus                    0 # Convert SASS (native ISA) to ptxplus and run ptxplus
-gpgpu_ptx_force_max_capability                   20 # Force maximum compute capability
-gpgpu_ptx_inst_debug_to_file                    0 # Dump executed instructions' debug information to file
-gpgpu_ptx_inst_debug_file       inst_debug.txt # Executed instructions' debug output file
-gpgpu_ptx_inst_debug_thread_uid                    1 # Thread UID for executed instructions' debug output
-gpgpu_simd_model                       1 # 1 = post-dominator
-gpgpu_shader_core_pipeline              1536:32 # shader core pipeline config, i.e., {<nthread>:<warpsize>}
-gpgpu_tex_cache:l1  4:128:24,L:R:m:N:L,F:128:4,128:2 # per-shader L1 texture cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>:<rf>}
-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4 # per-shader L1 constant memory cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:il1     4:128:4,L:R:f:N:L,A:2:32,4 # shader L1 instruction cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:dl1     32:128:4,L:L:m:N:H,A:32:8,8 # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PrefL1                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PreShared                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gmem_skip_L1D                          0 # global memory access skip L1D cache (implements -Xptxas -dlcm=cg, default=no skip)
-gpgpu_perfect_mem                      0 # enable perfect memory mode (no cache miss)
-n_regfile_gating_group                    4 # group of lanes that should be read/written together)
-gpgpu_clock_gated_reg_file                    0 # enable clock gated reg file for power calculations
-gpgpu_clock_gated_lanes                    0 # enable clock gated lanes for power calculations
-gpgpu_shader_registers                32768 # Number of registers per shader core. Limits number of concurrent CTAs. (default 8192)
-gpgpu_shader_cta                       8 # Maximum number of concurrent CTAs in shader (default 8)
-gpgpu_num_cta_barriers                   16 # Maximum number of named barriers per CTA (default 16)
-gpgpu_n_clusters                      15 # number of processing clusters
-gpgpu_n_cores_per_cluster                    4 # number of simd cores per cluster
-gpgpu_n_cluster_ejection_buffer_size                    8 # number of packets in ejection buffer
-gpgpu_n_ldst_response_buffer_size                    2 # number of response packets in ld/st unit ejection buffer
-gpgpu_shmem_size                   16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size                   49152 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefL1                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefShared                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_num_banks                   32 # Number of banks in the shared memory in each shader core (default 16)
-gpgpu_shmem_limited_broadcast                    0 # Limit shared memory to do one broadcast per cycle (default on)
-gpgpu_shmem_warp_parts                    1 # Number of portions a warp is divided into for shared memory bank conflict check 
-gpgpu_warpdistro_shader                   -1 # Specify which shader core to collect the warp size distribution from
-gpgpu_warp_issue_shader                    0 # Specify which shader core to collect the warp issue distribution from
-gpgpu_local_mem_map                    1 # Mapping from local memory space address to simulated GPU physical address space (default = enabled)
-gpgpu_num_reg_banks                   16 # Number of register banks (default = 8)
-gpgpu_reg_bank_use_warp_id                    0 # Use warp ID in mapping registers to banks (default = off)
-gpgpu_operand_collector_num_units_sp                    6 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_sfu                    8 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_mem                    2 # number of collector units (default = 2)
-gpgpu_operand_collector_num_units_gen                    0 # number of collector units (default = 0)
-gpgpu_operand_collector_num_in_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_operand_collector_num_out_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_coalesce_arch                   13 # Coalescing arch (default = 13, anything else is off for now)
-gpgpu_num_sched_per_core                    2 # Number of warp schedulers per core
-gpgpu_max_insn_issue_per_warp                    1 # Max number of instructions that can be issued per warp in one cycle by scheduler
-gpgpu_simt_core_sim_order                    1 # Select the simulation order of cores in a cluster (0=Fix, 1=Round-Robin)
-gpgpu_pipeline_widths        2,1,1,2,1,1,2 # Pipeline widths ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
-gpgpu_num_sp_units                     2 # Number of SP units (default=1)
-gpgpu_num_sfu_units                    1 # Number of SF units (default=1)
-gpgpu_num_mem_units                    1 # Number if ldst units (default=1) WARNING: not hooked up to anything
-gpgpu_scheduler                      gto # Scheduler configuration: < lrr | gto | two_level_active > If two_level_active:<num_active_warps>:<inner_prioritization>:<outer_prioritization>For complete list of prioritization values see shader.h enum scheduler_prioritization_typeDefault: gto
-gpgpu_dram_scheduler                    1 # 0 = fifo, 1 = FR-FCFS (defaul)
-gpgpu_dram_partition_queues              8:8:8:8 # i2$:$2d:d2$:$2i
-l2_ideal                               0 # Use a ideal L2 cache that always hit
-gpgpu_cache:dl2     64:128:8,L:B:m:W:L,A:32:4,4:0,32 # unified banked L2 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>}
-gpgpu_cache:dl2_texture_only                    0 # L2 cache used for texture only
-gpgpu_n_mem                            6 # number of memory modules (e.g. memory controllers) in gpu
-gpgpu_n_sub_partition_per_mchannel                    2 # number of memory subpartition in each memory module
-gpgpu_n_mem_per_ctrlr                    2 # number of memory chips per memory controller
-gpgpu_memlatency_stat                   14 # track and display latency statistics 0x2 enables MC, 0x4 enables queue logs
-gpgpu_frfcfs_dram_sched_queue_size                   16 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_return_queue_size                  116 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_buswidth                    4 # default = 4 bytes (8 bytes per cycle at DDR)
-gpgpu_dram_burst_length                    8 # Burst length of each DRAM request (default = 4 data bus cycle)
-dram_data_command_freq_ratio                    4 # Frequency ratio between DRAM data bus and command bus (default = 2 times, i.e. DDR)
-gpgpu_dram_timing_opt nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2 # DRAM timing parameters = {nbk:tCCD:tRRD:tRCD:tRAS:tRP:tRC:CL:WL:tCDLR:tWR:nbkgrp:tCCDL:tRTPL}
-rop_latency                          120 # ROP queue latency (default 85)
-dram_latency                         100 # DRAM latency (default 30)
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS # mapping memory address to dram model {dramid@<start bit>;<memory address map>}
-gpgpu_mem_addr_test                    0 # run sweep test to check address mapping for aliased address
-gpgpu_mem_address_mask                    1 # 0 = old addressing mask, 1 = new addressing mask, 2 = new add. mask + flipped bank sel and chip sel bits
-gpuwattch_xml_file  gpuwattch_gtx480.xml # GPUWattch XML file
-power_simulation_enabled                    1 # Turn on power simulator (1=On, 0=Off)
-power_per_cycle_dump                    0 # Dump detailed power output each cycle
-power_trace_enabled                    0 # produce a file for the power trace (1=On, 0=Off)
-power_trace_zlevel                     6 # Compression level of the power trace output log (0=no comp, 9=highest)
-steady_power_levels_enabled                    0 # produce a file for the steady power levels (1=On, 0=Off)
-steady_state_definition                  8:4 # allowed deviation:number of samples
-gpgpu_max_cycle                        0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_insn                         0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_cta                          0 # terminates gpu simulation early (0 = no limit)
-gpgpu_runtime_stat                   500 # display runtime statistics such as dram utilization {<freq>:<flag>}
-liveness_message_freq                    1 # Minimum number of seconds between simulation liveness messages (0 = always print)
-gpgpu_flush_l1_cache                    0 # Flush L1 cache at the end of each kernel call
-gpgpu_flush_l2_cache                    0 # Flush L2 cache at the end of each kernel call
-gpgpu_deadlock_detect                    1 # Stop the simulation at deadlock (1=on (default), 0=off)
-gpgpu_ptx_instruction_classification                    0 # if enabled will classify ptx instruction types per kernel (Max 255 kernels now)
-gpgpu_ptx_sim_mode                     0 # Select between Performance (default) or Functional simulation (1)
-gpgpu_clock_domains 700.0:700.0:700.0:924.0 # Clock Domain Frequencies in MhZ {<Core Clock>:<ICNT Clock>:<L2 Clock>:<DRAM Clock>}
-gpgpu_max_concurrent_kernel                    8 # maximum kernels that can run concurrently on GPU
-gpgpu_cflog_interval                    0 # Interval between each snapshot in control flow logger
-visualizer_enabled                     0 # Turn on visualizer output (1=On, 0=Off)
-visualizer_outputfile                 NULL # Specifies the output log file for visualizer
-visualizer_zlevel                      6 # Compression level of the visualizer output log (0=no comp, 9=highest)
-trace_enabled                          0 # Turn on traces
-trace_components                    none # comma seperated list of traces to enable. Complete list found in trace_streams.tup. Default none
-trace_sampling_core                    0 # The core which is printed using CORE_DPRINTF. Default 0
-trace_sampling_memory_partition                   -1 # The memory partition which is printed using MEMPART_DPRINTF. Default -1 (i.e. all)
-enable_ptx_file_line_stats                    1 # Turn on PTX source line statistic profiling. (1 = On)
-ptx_line_stats_filename gpgpu_inst_stats.txt # Output file for PTX source line statistics.
-save_embedded_ptx                      0 # saves ptx files embedded in binary as <n>.ptx
-keep                                   0 # keep intermediate files created by GPGPU-Sim when interfacing with external programs
-gpgpu_ptx_save_converted_ptxplus                    0 # Saved converted ptxplus to a file
-ptx_opcode_latency_int         4,13,4,5,145 # Opcode latencies for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,19,25,145
-ptx_opcode_latency_fp          4,13,4,5,39 # Opcode latencies for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,30
-ptx_opcode_latency_dp         8,19,8,8,330 # Opcode latencies for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,335
-ptx_opcode_initiation_int            1,2,2,1,8 # Opcode initiation intervals for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,4,4,32
-ptx_opcode_initiation_fp            1,2,1,1,4 # Opcode initiation intervals for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,5
-ptx_opcode_initiation_dp         8,16,8,8,130 # Opcode initiation intervals for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,130
DRAM Timing Options:
nbk                                    16 # number of banks
CCD                                     2 # column to column delay
RRD                                     6 # minimal delay between activation of rows in different banks
RCD                                    12 # row to column delay
RAS                                    28 # time needed to activate row
RP                                     12 # time needed to precharge (deactivate) row
RC                                     40 # row cycle time
CDLR                                    5 # switching from write to read (changes tWTR)
WR                                     12 # last data-in to row precharge
CL                                     12 # CAS latency
WL                                      4 # Write latency
nbkgrp                                  4 # number of bank groups
CCDL                                    3 # column to column delay between accesses to different bank groups
RTPL                                    2 # read to precharge delay between accesses to different bank groups
Total number of memory sub partition = 12
addr_dec_mask[CHIP]  = 0000000000000000 	high:64 low:0
addr_dec_mask[BK]    = 000000000000e100 	high:16 low:8
addr_dec_mask[ROW]   = 000000000fff0000 	high:28 low:16
addr_dec_mask[COL]   = 0000000000001eff 	high:13 low:0
addr_dec_mask[BURST] = 000000000000003f 	high:6 low:0
sub_partition_id_mask = 0000000000000100
GPGPU-Sim uArch: clock freqs: 700000000.000000:700000000.000000:700000000.000000:924000000.000000
GPGPU-Sim uArch: clock periods: 0.00000000142857142857:0.00000000142857142857:0.00000000142857142857:0.00000000108225108225
*** Initializing Memory Statistics ***
GPGPU-Sim uArch: interconnect node map (shaderID+MemID to icntID)
GPGPU-Sim uArch: Memory nodes ID start from index: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
GPGPU-Sim uArch: interconnect node reverse map (icntID to shaderID+MemID)
GPGPU-Sim uArch: Memory nodes start from ID: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
83a4e518f69376f7e08643a3a9e17862  /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
GPGPU-Sim uArch: performance model initialization complete.
GPGPU-Sim PTX: __cudaRegisterFatBinary, fat_cubin_handle = 1, filename=mm.cu
self exe links to: /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
Running md5sum using "md5sum /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM "
Running cuobjdump using "$CUDA_INSTALL_PATH/bin/cuobjdump -ptx -elf -sass /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM > _cuobjdump_complete_output_pNOLTc"
Parsing file _cuobjdump_complete_output_pNOLTc
######### cuobjdump parser ########
## Adding new section ELF
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_1.ptx
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section ELF
Adding arch: sm_20
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_2.ptx
Adding arch: sm_20
Adding identifier: mm.cu
Done parsing!!!
GPGPU-Sim PTX: __cudaRegisterFunction _Z14matrix_mul_gpuPiS_S_i : hostFun 0x0x400ce0, fat_cubin_handle = 1
GPGPU-Sim PTX: instruction assembly for function '_Z14matrix_mul_gpuPiS_S_i'...   done.
GPGPU-Sim PTX: finding reconvergence points for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: reconvergence points for _Z14matrix_mul_gpuPiS_S_i...
GPGPU-Sim PTX:  1 (potential) branch divergence @  PC=0x048 (_1.ptx:71) @%p1 bra $Lt_0_2306;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX:  2 (potential) branch divergence @  PC=0x130 (_1.ptx:103) @%p2 bra $Lt_0_1794;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:  3 (potential) branch divergence @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX: ... end of reconvergence points for _Z14matrix_mul_gpuPiS_S_i
GPGPU-Sim PTX: ... done pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'.
GPGPU-Sim PTX: finished parsing EMBEDDED .ptx file _1.ptx
Adding _cuobjdump_2.ptx with cubin handle 1
GPGPU-Sim PTX: extracting embedded .ptx to temporary file "_ptx_I7Fn0g"
Running: cat _ptx_I7Fn0g | sed 's/.version 1.5/.version 1.4/' | sed 's/, texmode_independent//' | sed 's/\(\.extern \.const\[1\] .b8 \w\+\)\[\]/\1\[1\]/' | sed 's/const\[.\]/const\[0\]/g' > _ptx2_Bad06k
GPGPU-Sim PTX: generating ptxinfo using "$CUDA_INSTALL_PATH/bin/ptxas --gpu-name=sm_20 -v _ptx2_Bad06k --output-file  /dev/null 2> _ptx_I7Fn0ginfo"
GPGPU-Sim PTX: Kernel '_Z14matrix_mul_gpuPiS_S_i' : regs=14, lmem=0, smem=0, cmem=60
GPGPU-Sim PTX: removing ptxinfo using "rm -f _ptx_I7Fn0g _ptx2_Bad06k _ptx_I7Fn0ginfo"
GPGPU-Sim PTX: loading globals with explicit initializers... 
GPGPU-Sim PTX: finished loading globals (0 bytes total).
GPGPU-Sim PTX: loading constants with explicit initializers...  done.
Block(10,10)   Grid(15,8).

GPGPU-Sim PTX: cudaLaunch for 0x0x400ce0 (mode=performance simulation) on stream 0
GPGPU-Sim PTX: pushing kernel '_Z14matrix_mul_gpuPiS_S_i' to stream 0, gridDim= (15,8,1) blockDim = (10,10,1) 
kernel '_Z14matrix_mul_gpuPiS_S_i' transfer to GPU hardware scheduler
GPGPU-Sim uArch: Shader 4 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: CTA/core = 8, limited by: cta_limit
GPGPU-Sim uArch: core:  4, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 8 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  8, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 12 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 12, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 16 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 16, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 20 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 20, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 24 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 24, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 28 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 28, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 32 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 32, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 36 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 36, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 40 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 40, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 44 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 44, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 48 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 48, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 52 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 52, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 56 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 56, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 0 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  0, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 5 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  5, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 9 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  9, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 13 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 13, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 17 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 17, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 21 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 21, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 25 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 25, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 29 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 29, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 33 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 33, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 37 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 37, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 41 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 41, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 45 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 45, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 49 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 49, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 53 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 53, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 57 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 57, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 1 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  1, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 6 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  6, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 10 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 10, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 14 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 14, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 18 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 18, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 22 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 22, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 26 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 26, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 30 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 30, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 34 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 34, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 38 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 38, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 42 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 42, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 46 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 46, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 50 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 50, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 54 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 54, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 58 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 58, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 2 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  2, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 7 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  7, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 11 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 11, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 15 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 15, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 19 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 19, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 23 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 23, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 27 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 27, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 31 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 31, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 35 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 35, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 39 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 39, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 43 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 43, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 47 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 47, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 51 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 51, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 55 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 55, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 59 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 59, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 3 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  3, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: core:  4, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  8, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 12, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 16, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 20, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 24, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 28, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 32, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 36, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 40, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 44, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 48, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 52, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 56, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  0, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  5, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  9, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 13, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 17, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 21, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 25, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 29, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 33, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 37, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 41, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 45, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 49, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 53, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 57, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  1, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  6, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 10, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 14, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 18, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 22, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 26, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 30, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 34, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 38, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 42, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 46, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 50, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 54, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 58, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  2, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  7, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 11, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 15, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 19, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 23, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 27, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 31, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 35, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 39, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 43, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 47, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 51, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 55, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 59, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core:  3, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: cycles simulated: 500  inst.: 49456 (ipc=98.9) sim_rate=49456 (inst/sec) elapsed = 0:0:00:01 / Mon Jun 14 15:44:35 2021
GPGPU-Sim PTX: 100000 instructions simulated : ctaid=(1,0,0) tid=(1,5,0)
GPGPU-Sim PTX: 200000 instructions simulated : ctaid=(0,4,0) tid=(1,1,0)
GPGPU-Sim PTX: 300000 instructions simulated : ctaid=(8,2,0) tid=(5,3,0)
GPGPU-Sim uArch: cycles simulated: 1500  inst.: 305100 (ipc=203.4) sim_rate=152550 (inst/sec) elapsed = 0:0:00:02 / Mon Jun 14 15:44:36 2021
GPGPU-Sim PTX: 400000 instructions simulated : ctaid=(2,2,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 2500  inst.: 473544 (ipc=189.4) sim_rate=157848 (inst/sec) elapsed = 0:0:00:03 / Mon Jun 14 15:44:37 2021
GPGPU-Sim PTX: 500000 instructions simulated : ctaid=(12,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 600000 instructions simulated : ctaid=(3,6,0) tid=(7,4,0)
GPGPU-Sim PTX: 700000 instructions simulated : ctaid=(8,1,0) tid=(3,8,0)
GPGPU-Sim PTX: 800000 instructions simulated : ctaid=(3,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 900000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 3500  inst.: 874228 (ipc=249.8) sim_rate=218557 (inst/sec) elapsed = 0:0:00:04 / Mon Jun 14 15:44:38 2021
GPGPU-Sim PTX: 1000000 instructions simulated : ctaid=(8,6,0) tid=(7,8,0)
GPGPU-Sim PTX: 1100000 instructions simulated : ctaid=(1,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 1200000 instructions simulated : ctaid=(5,0,0) tid=(9,1,0)
GPGPU-Sim PTX: 1300000 instructions simulated : ctaid=(11,0,0) tid=(9,9,0)
GPGPU-Sim uArch: cycles simulated: 4500  inst.: 1279928 (ipc=284.4) sim_rate=255985 (inst/sec) elapsed = 0:0:00:05 / Mon Jun 14 15:44:39 2021
GPGPU-Sim PTX: 1400000 instructions simulated : ctaid=(9,0,0) tid=(3,4,0)
GPGPU-Sim PTX: 1500000 instructions simulated : ctaid=(10,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 1600000 instructions simulated : ctaid=(4,5,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 5500  inst.: 1658768 (ipc=301.6) sim_rate=276461 (inst/sec) elapsed = 0:0:00:06 / Mon Jun 14 15:44:40 2021
GPGPU-Sim PTX: 1700000 instructions simulated : ctaid=(10,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 1800000 instructions simulated : ctaid=(0,3,0) tid=(5,7,0)
GPGPU-Sim uArch: cycles simulated: 6000  inst.: 1825436 (ipc=304.2) sim_rate=260776 (inst/sec) elapsed = 0:0:00:07 / Mon Jun 14 15:44:41 2021
GPGPU-Sim PTX: 1900000 instructions simulated : ctaid=(1,1,0) tid=(7,6,0)
GPGPU-Sim PTX: 2000000 instructions simulated : ctaid=(14,4,0) tid=(9,1,0)
GPGPU-Sim PTX: 2100000 instructions simulated : ctaid=(2,5,0) tid=(1,7,0)
GPGPU-Sim PTX: 2200000 instructions simulated : ctaid=(5,2,0) tid=(9,7,0)
GPGPU-Sim uArch: cycles simulated: 7000  inst.: 2181760 (ipc=311.7) sim_rate=272720 (inst/sec) elapsed = 0:0:00:08 / Mon Jun 14 15:44:42 2021
GPGPU-Sim PTX: 2300000 instructions simulated : ctaid=(1,2,0) tid=(1,9,0)
GPGPU-Sim PTX: 2400000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 2500000 instructions simulated : ctaid=(14,2,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 8000  inst.: 2555524 (ipc=319.4) sim_rate=283947 (inst/sec) elapsed = 0:0:00:09 / Mon Jun 14 15:44:43 2021
GPGPU-Sim PTX: 2600000 instructions simulated : ctaid=(11,4,0) tid=(5,3,0)
GPGPU-Sim PTX: 2700000 instructions simulated : ctaid=(0,1,0) tid=(7,8,0)
GPGPU-Sim PTX: 2800000 instructions simulated : ctaid=(1,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 2900000 instructions simulated : ctaid=(6,6,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 9000  inst.: 2895792 (ipc=321.8) sim_rate=289579 (inst/sec) elapsed = 0:0:00:10 / Mon Jun 14 15:44:44 2021
GPGPU-Sim PTX: 3000000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim PTX: 3100000 instructions simulated : ctaid=(9,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 3200000 instructions simulated : ctaid=(5,1,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 10000  inst.: 3257748 (ipc=325.8) sim_rate=296158 (inst/sec) elapsed = 0:0:00:11 / Mon Jun 14 15:44:45 2021
GPGPU-Sim PTX: 3300000 instructions simulated : ctaid=(12,6,0) tid=(9,5,0)
GPGPU-Sim PTX: 3400000 instructions simulated : ctaid=(0,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 3500000 instructions simulated : ctaid=(14,7,0) tid=(3,8,0)
GPGPU-Sim PTX: 3600000 instructions simulated : ctaid=(8,0,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 11000  inst.: 3600988 (ipc=327.4) sim_rate=300082 (inst/sec) elapsed = 0:0:00:12 / Mon Jun 14 15:44:46 2021
GPGPU-Sim PTX: 3700000 instructions simulated : ctaid=(12,3,0) tid=(5,3,0)
GPGPU-Sim PTX: 3800000 instructions simulated : ctaid=(13,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 3900000 instructions simulated : ctaid=(4,7,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 12000  inst.: 3965588 (ipc=330.5) sim_rate=305045 (inst/sec) elapsed = 0:0:00:13 / Mon Jun 14 15:44:47 2021
GPGPU-Sim PTX: 4000000 instructions simulated : ctaid=(4,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 4100000 instructions simulated : ctaid=(5,6,0) tid=(7,2,0)
GPGPU-Sim PTX: 4200000 instructions simulated : ctaid=(3,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 4300000 instructions simulated : ctaid=(5,2,0) tid=(5,7,0)
GPGPU-Sim uArch: cycles simulated: 13000  inst.: 4315724 (ipc=332.0) sim_rate=308266 (inst/sec) elapsed = 0:0:00:14 / Mon Jun 14 15:44:48 2021
GPGPU-Sim PTX: 4400000 instructions simulated : ctaid=(1,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 4500000 instructions simulated : ctaid=(4,3,0) tid=(7,6,0)
GPGPU-Sim PTX: 4600000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 14000  inst.: 4667380 (ipc=333.4) sim_rate=311158 (inst/sec) elapsed = 0:0:00:15 / Mon Jun 14 15:44:49 2021
GPGPU-Sim PTX: 4700000 instructions simulated : ctaid=(12,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 4800000 instructions simulated : ctaid=(12,7,0) tid=(7,6,0)
GPGPU-Sim PTX: 4900000 instructions simulated : ctaid=(14,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 5000000 instructions simulated : ctaid=(14,2,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 15000  inst.: 5017508 (ipc=334.5) sim_rate=313594 (inst/sec) elapsed = 0:0:00:16 / Mon Jun 14 15:44:50 2021
GPGPU-Sim PTX: 5100000 instructions simulated : ctaid=(7,3,0) tid=(3,8,0)
GPGPU-Sim PTX: 5200000 instructions simulated : ctaid=(14,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 15500  inst.: 5198688 (ipc=335.4) sim_rate=305805 (inst/sec) elapsed = 0:0:00:17 / Mon Jun 14 15:44:51 2021
GPGPU-Sim PTX: 5300000 instructions simulated : ctaid=(9,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 5400000 instructions simulated : ctaid=(3,5,0) tid=(7,6,0)
GPGPU-Sim PTX: 5500000 instructions simulated : ctaid=(4,3,0) tid=(5,3,0)
GPGPU-Sim uArch: cycles simulated: 16500  inst.: 5561548 (ipc=337.1) sim_rate=308974 (inst/sec) elapsed = 0:0:00:18 / Mon Jun 14 15:44:52 2021
GPGPU-Sim PTX: 5600000 instructions simulated : ctaid=(5,1,0) tid=(9,9,0)
GPGPU-Sim PTX: 5700000 instructions simulated : ctaid=(8,0,0) tid=(3,4,0)
GPGPU-Sim PTX: 5800000 instructions simulated : ctaid=(3,1,0) tid=(1,3,0)
GPGPU-Sim PTX: 5900000 instructions simulated : ctaid=(0,7,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 17500  inst.: 5922888 (ipc=338.5) sim_rate=311730 (inst/sec) elapsed = 0:0:00:19 / Mon Jun 14 15:44:53 2021
GPGPU-Sim PTX: 6000000 instructions simulated : ctaid=(3,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 6100000 instructions simulated : ctaid=(2,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 6200000 instructions simulated : ctaid=(2,0,0) tid=(9,3,0)
GPGPU-Sim PTX: 6300000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 18500  inst.: 6276544 (ipc=339.3) sim_rate=313827 (inst/sec) elapsed = 0:0:00:20 / Mon Jun 14 15:44:54 2021
GPGPU-Sim PTX: 6400000 instructions simulated : ctaid=(10,0,0) tid=(1,7,0)
GPGPU-Sim PTX: 6500000 instructions simulated : ctaid=(13,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 6600000 instructions simulated : ctaid=(8,0,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 19500  inst.: 6644036 (ipc=340.7) sim_rate=316382 (inst/sec) elapsed = 0:0:00:21 / Mon Jun 14 15:44:55 2021
GPGPU-Sim PTX: 6700000 instructions simulated : ctaid=(8,6,0) tid=(5,1,0)
GPGPU-Sim PTX: 6800000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim PTX: 6900000 instructions simulated : ctaid=(5,6,0) tid=(9,9,0)
GPGPU-Sim PTX: 7000000 instructions simulated : ctaid=(4,5,0) tid=(5,9,0)
GPGPU-Sim uArch: cycles simulated: 20500  inst.: 7001524 (ipc=341.5) sim_rate=318251 (inst/sec) elapsed = 0:0:00:22 / Mon Jun 14 15:44:56 2021
GPGPU-Sim PTX: 7100000 instructions simulated : ctaid=(11,0,0) tid=(3,2,0)
GPGPU-Sim PTX: 7200000 instructions simulated : ctaid=(12,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 7300000 instructions simulated : ctaid=(12,0,0) tid=(5,1,0)
GPGPU-Sim uArch: cycles simulated: 21500  inst.: 7362756 (ipc=342.5) sim_rate=320119 (inst/sec) elapsed = 0:0:00:23 / Mon Jun 14 15:44:57 2021
GPGPU-Sim PTX: 7400000 instructions simulated : ctaid=(1,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 7500000 instructions simulated : ctaid=(11,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 7600000 instructions simulated : ctaid=(4,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 7700000 instructions simulated : ctaid=(11,7,0) tid=(5,9,0)
GPGPU-Sim uArch: cycles simulated: 22500  inst.: 7729204 (ipc=343.5) sim_rate=322050 (inst/sec) elapsed = 0:0:00:24 / Mon Jun 14 15:44:58 2021
GPGPU-Sim PTX: 7800000 instructions simulated : ctaid=(11,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 7900000 instructions simulated : ctaid=(7,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 8000000 instructions simulated : ctaid=(8,7,0) tid=(1,5,0)
GPGPU-Sim PTX: 8100000 instructions simulated : ctaid=(4,0,0) tid=(3,6,0)
GPGPU-Sim uArch: cycles simulated: 23500  inst.: 8076140 (ipc=343.7) sim_rate=323045 (inst/sec) elapsed = 0:0:00:25 / Mon Jun 14 15:44:59 2021
GPGPU-Sim PTX: 8200000 instructions simulated : ctaid=(6,3,0) tid=(7,0,0)
GPGPU-Sim PTX: 8300000 instructions simulated : ctaid=(12,7,0) tid=(3,6,0)
GPGPU-Sim PTX: 8400000 instructions simulated : ctaid=(5,5,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 24500  inst.: 8433512 (ipc=344.2) sim_rate=324365 (inst/sec) elapsed = 0:0:00:26 / Mon Jun 14 15:45:00 2021
GPGPU-Sim PTX: 8500000 instructions simulated : ctaid=(1,2,0) tid=(1,3,0)
GPGPU-Sim PTX: 8600000 instructions simulated : ctaid=(13,7,0) tid=(3,8,0)
GPGPU-Sim uArch: cycles simulated: 25000  inst.: 8616376 (ipc=344.7) sim_rate=319125 (inst/sec) elapsed = 0:0:00:27 / Mon Jun 14 15:45:01 2021
GPGPU-Sim PTX: 8700000 instructions simulated : ctaid=(3,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 8800000 instructions simulated : ctaid=(11,4,0) tid=(9,5,0)
GPGPU-Sim PTX: 8900000 instructions simulated : ctaid=(10,7,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 26000  inst.: 8970768 (ipc=345.0) sim_rate=320384 (inst/sec) elapsed = 0:0:00:28 / Mon Jun 14 15:45:02 2021
GPGPU-Sim PTX: 9000000 instructions simulated : ctaid=(9,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 9100000 instructions simulated : ctaid=(10,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 9200000 instructions simulated : ctaid=(4,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 9300000 instructions simulated : ctaid=(8,4,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 27000  inst.: 9350236 (ipc=346.3) sim_rate=322421 (inst/sec) elapsed = 0:0:00:29 / Mon Jun 14 15:45:03 2021
GPGPU-Sim PTX: 9400000 instructions simulated : ctaid=(6,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 9500000 instructions simulated : ctaid=(8,1,0) tid=(5,7,0)
GPGPU-Sim PTX: 9600000 instructions simulated : ctaid=(5,1,0) tid=(9,7,0)
GPGPU-Sim PTX: 9700000 instructions simulated : ctaid=(7,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 28000  inst.: 9687816 (ipc=346.0) sim_rate=322927 (inst/sec) elapsed = 0:0:00:30 / Mon Jun 14 15:45:04 2021
GPGPU-Sim PTX: 9800000 instructions simulated : ctaid=(1,2,0) tid=(1,1,0)
GPGPU-Sim PTX: 9900000 instructions simulated : ctaid=(1,7,0) tid=(7,8,0)
GPGPU-Sim PTX: 10000000 instructions simulated : ctaid=(1,2,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 29000  inst.: 10051864 (ipc=346.6) sim_rate=324253 (inst/sec) elapsed = 0:0:00:31 / Mon Jun 14 15:45:05 2021
GPGPU-Sim PTX: 10100000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 10200000 instructions simulated : ctaid=(3,6,0) tid=(9,1,0)
GPGPU-Sim PTX: 10300000 instructions simulated : ctaid=(2,1,0) tid=(3,4,0)
GPGPU-Sim PTX: 10400000 instructions simulated : ctaid=(11,2,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 30000  inst.: 10408808 (ipc=347.0) sim_rate=325275 (inst/sec) elapsed = 0:0:00:32 / Mon Jun 14 15:45:06 2021
GPGPU-Sim PTX: 10500000 instructions simulated : ctaid=(6,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 10600000 instructions simulated : ctaid=(12,4,0) tid=(3,0,0)
GPGPU-Sim PTX: 10700000 instructions simulated : ctaid=(0,4,0) tid=(9,7,0)
GPGPU-Sim uArch: cycles simulated: 31000  inst.: 10769580 (ipc=347.4) sim_rate=326350 (inst/sec) elapsed = 0:0:00:33 / Mon Jun 14 15:45:07 2021
GPGPU-Sim PTX: 10800000 instructions simulated : ctaid=(14,4,0) tid=(7,6,0)
GPGPU-Sim PTX: 10900000 instructions simulated : ctaid=(7,6,0) tid=(9,1,0)
GPGPU-Sim PTX: 11000000 instructions simulated : ctaid=(6,3,0) tid=(5,5,0)
GPGPU-Sim PTX: 11100000 instructions simulated : ctaid=(13,2,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 32000  inst.: 11110308 (ipc=347.2) sim_rate=326773 (inst/sec) elapsed = 0:0:00:34 / Mon Jun 14 15:45:08 2021
GPGPU-Sim PTX: 11200000 instructions simulated : ctaid=(6,4,0) tid=(7,0,0)
GPGPU-Sim PTX: 11300000 instructions simulated : ctaid=(3,6,0) tid=(9,9,0)
GPGPU-Sim PTX: 11400000 instructions simulated : ctaid=(13,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 11500000 instructions simulated : ctaid=(4,5,0) tid=(5,5,0)
GPGPU-Sim uArch: cycles simulated: 33000  inst.: 11488720 (ipc=348.1) sim_rate=328249 (inst/sec) elapsed = 0:0:00:35 / Mon Jun 14 15:45:09 2021
GPGPU-Sim PTX: 11600000 instructions simulated : ctaid=(12,2,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 33500  inst.: 11664956 (ipc=348.2) sim_rate=324026 (inst/sec) elapsed = 0:0:00:36 / Mon Jun 14 15:45:10 2021
GPGPU-Sim PTX: 11700000 instructions simulated : ctaid=(12,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 11800000 instructions simulated : ctaid=(7,1,0) tid=(7,0,0)
GPGPU-Sim PTX: 11900000 instructions simulated : ctaid=(2,2,0) tid=(5,5,0)
GPGPU-Sim PTX: 12000000 instructions simulated : ctaid=(8,6,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 34500  inst.: 12026356 (ipc=348.6) sim_rate=325036 (inst/sec) elapsed = 0:0:00:37 / Mon Jun 14 15:45:11 2021
GPGPU-Sim PTX: 12100000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 12200000 instructions simulated : ctaid=(11,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 12300000 instructions simulated : ctaid=(2,5,0) tid=(1,5,0)
GPGPU-Sim PTX: 12400000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 35500  inst.: 12395792 (ipc=349.2) sim_rate=326205 (inst/sec) elapsed = 0:0:00:38 / Mon Jun 14 15:45:12 2021
GPGPU-Sim PTX: 12500000 instructions simulated : ctaid=(2,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 12600000 instructions simulated : ctaid=(0,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 12700000 instructions simulated : ctaid=(6,3,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 36500  inst.: 12743812 (ipc=349.1) sim_rate=326764 (inst/sec) elapsed = 0:0:00:39 / Mon Jun 14 15:45:13 2021
GPGPU-Sim PTX: 12800000 instructions simulated : ctaid=(5,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 12900000 instructions simulated : ctaid=(12,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 13000000 instructions simulated : ctaid=(14,2,0) tid=(1,5,0)
GPGPU-Sim PTX: 13100000 instructions simulated : ctaid=(11,5,0) tid=(1,3,0)
GPGPU-Sim uArch: cycles simulated: 37500  inst.: 13091280 (ipc=349.1) sim_rate=327282 (inst/sec) elapsed = 0:0:00:40 / Mon Jun 14 15:45:14 2021
GPGPU-Sim PTX: 13200000 instructions simulated : ctaid=(8,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 13300000 instructions simulated : ctaid=(1,6,0) tid=(7,0,0)
GPGPU-Sim PTX: 13400000 instructions simulated : ctaid=(1,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 38500  inst.: 13449032 (ipc=349.3) sim_rate=328025 (inst/sec) elapsed = 0:0:00:41 / Mon Jun 14 15:45:15 2021
GPGPU-Sim PTX: 13500000 instructions simulated : ctaid=(3,0,0) tid=(1,3,0)
GPGPU-Sim PTX: 13600000 instructions simulated : ctaid=(7,5,0) tid=(3,4,0)
GPGPU-Sim PTX: 13700000 instructions simulated : ctaid=(3,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 13800000 instructions simulated : ctaid=(2,1,0) tid=(1,3,0)
GPGPU-Sim uArch: cycles simulated: 39500  inst.: 13803876 (ipc=349.5) sim_rate=328663 (inst/sec) elapsed = 0:0:00:42 / Mon Jun 14 15:45:16 2021
GPGPU-Sim PTX: 13900000 instructions simulated : ctaid=(1,2,0) tid=(3,6,0)
GPGPU-Sim PTX: 14000000 instructions simulated : ctaid=(1,4,0) tid=(9,1,0)
GPGPU-Sim PTX: 14100000 instructions simulated : ctaid=(0,2,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 40500  inst.: 14167076 (ipc=349.8) sim_rate=329466 (inst/sec) elapsed = 0:0:00:43 / Mon Jun 14 15:45:17 2021
GPGPU-Sim PTX: 14200000 instructions simulated : ctaid=(9,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 14300000 instructions simulated : ctaid=(4,6,0) tid=(3,8,0)
GPGPU-Sim PTX: 14400000 instructions simulated : ctaid=(4,5,0) tid=(9,7,0)
GPGPU-Sim PTX: 14500000 instructions simulated : ctaid=(8,6,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 41500  inst.: 14512928 (ipc=349.7) sim_rate=329839 (inst/sec) elapsed = 0:0:00:44 / Mon Jun 14 15:45:18 2021
GPGPU-Sim PTX: 14600000 instructions simulated : ctaid=(4,3,0) tid=(3,2,0)
GPGPU-Sim PTX: 14700000 instructions simulated : ctaid=(10,5,0) tid=(7,4,0)
GPGPU-Sim PTX: 14800000 instructions simulated : ctaid=(11,6,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 42500  inst.: 14871404 (ipc=349.9) sim_rate=330475 (inst/sec) elapsed = 0:0:00:45 / Mon Jun 14 15:45:19 2021
GPGPU-Sim PTX: 14900000 instructions simulated : ctaid=(7,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 15000000 instructions simulated : ctaid=(14,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 15100000 instructions simulated : ctaid=(9,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 15200000 instructions simulated : ctaid=(3,2,0) tid=(5,5,0)
GPGPU-Sim uArch: cycles simulated: 43500  inst.: 15233248 (ipc=350.2) sim_rate=331157 (inst/sec) elapsed = 0:0:00:46 / Mon Jun 14 15:45:20 2021
GPGPU-Sim PTX: 15300000 instructions simulated : ctaid=(14,1,0) tid=(5,5,0)
GPGPU-Sim PTX: 15400000 instructions simulated : ctaid=(0,2,0) tid=(7,4,0)
GPGPU-Sim PTX: 15500000 instructions simulated : ctaid=(0,0,0) tid=(7,8,0)
GPGPU-Sim PTX: 15600000 instructions simulated : ctaid=(1,1,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 44500  inst.: 15586264 (ipc=350.3) sim_rate=331622 (inst/sec) elapsed = 0:0:00:47 / Mon Jun 14 15:45:21 2021
GPGPU-Sim PTX: 15700000 instructions simulated : ctaid=(6,3,0) tid=(3,6,0)
GPGPU-Sim PTX: 15800000 instructions simulated : ctaid=(4,7,0) tid=(3,4,0)
GPGPU-Sim PTX: 15900000 instructions simulated : ctaid=(0,2,0) tid=(9,7,0)
GPGPU-Sim uArch: cycles simulated: 45500  inst.: 15949040 (ipc=350.5) sim_rate=332271 (inst/sec) elapsed = 0:0:00:48 / Mon Jun 14 15:45:22 2021
GPGPU-Sim PTX: 16000000 instructions simulated : ctaid=(9,5,0) tid=(9,1,0)
GPGPU-Sim PTX: 16100000 instructions simulated : ctaid=(11,5,0) tid=(1,7,0)
GPGPU-Sim PTX: 16200000 instructions simulated : ctaid=(6,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 16300000 instructions simulated : ctaid=(4,4,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 46500  inst.: 16323188 (ipc=351.0) sim_rate=333126 (inst/sec) elapsed = 0:0:00:49 / Mon Jun 14 15:45:23 2021
GPGPU-Sim PTX: 16400000 instructions simulated : ctaid=(5,1,0) tid=(3,2,0)
GPGPU-Sim PTX: 16500000 instructions simulated : ctaid=(4,6,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 47000  inst.: 16499776 (ipc=351.1) sim_rate=329995 (inst/sec) elapsed = 0:0:00:50 / Mon Jun 14 15:45:24 2021
GPGPU-Sim PTX: 16600000 instructions simulated : ctaid=(10,2,0) tid=(5,1,0)
GPGPU-Sim uArch: Shader 16 finished CTA #0 (47265,0), 1 CTAs running
GPGPU-Sim uArch: Shader 29 finished CTA #0 (47414,0), 1 CTAs running
GPGPU-Sim uArch: Shader 5 finished CTA #0 (47423,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #0 (47436,0), 1 CTAs running
GPGPU-Sim uArch: Shader 51 finished CTA #0 (47497,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #0 (47499,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #0 (47501,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #0 (47559,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #1 (47584,0), 0 CTAs running
GPGPU-Sim uArch: Shader 14 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 39 finished CTA #0 (47615,0), 1 CTAs running
GPGPU-Sim uArch: Shader 16 finished CTA #1 (47620,0), 0 CTAs running
GPGPU-Sim uArch: Shader 16 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #0 (47632,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #0 (47633,0), 1 CTAs running
GPGPU-Sim uArch: Shader 7 finished CTA #0 (47636,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #1 (47654,0), 0 CTAs running
GPGPU-Sim uArch: Shader 17 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 10 finished CTA #0 (47661,0), 1 CTAs running
GPGPU-Sim uArch: Shader 34 finished CTA #1 (47663,0), 1 CTAs running
GPGPU-Sim uArch: Shader 53 finished CTA #0 (47671,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #0 (47690,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #1 (47725,0), 0 CTAs running
GPGPU-Sim uArch: Shader 18 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 27 finished CTA #0 (47732,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #0 (47734,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #0 (47754,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #0 (47758,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #0 (47760,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #1 (47767,0), 0 CTAs running
GPGPU-Sim uArch: Shader 12 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 5 finished CTA #1 (47768,0), 0 CTAs running
GPGPU-Sim uArch: Shader 5 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 32 finished CTA #0 (47776,0), 1 CTAs running
GPGPU-Sim uArch: Shader 27 finished CTA #1 (47790,0), 0 CTAs running
GPGPU-Sim uArch: Shader 27 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #0 (47794,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #0 (47796,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #1 (47798,0), 0 CTAs running
GPGPU-Sim uArch: Shader 8 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #0 (47811,0), 1 CTAs running
GPGPU-Sim uArch: Shader 25 finished CTA #0 (47812,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #1 (47813,0), 0 CTAs running
GPGPU-Sim uArch: Shader 3 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 13 finished CTA #1 (47818,0), 1 CTAs running
GPGPU-Sim uArch: Shader 13 finished CTA #0 (47838,0), 0 CTAs running
GPGPU-Sim uArch: Shader 13 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 44 finished CTA #0 (47844,0), 1 CTAs running
GPGPU-Sim uArch: Shader 32 finished CTA #1 (47848,0), 0 CTAs running
GPGPU-Sim uArch: Shader 32 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 50 finished CTA #1 (47862,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #1 (47863,0), 0 CTAs running
GPGPU-Sim uArch: Shader 23 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 53 finished CTA #1 (47865,0), 0 CTAs running
GPGPU-Sim uArch: Shader 53 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 7 finished CTA #1 (47867,0), 0 CTAs running
GPGPU-Sim uArch: Shader 7 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #1 (47875,0), 1 CTAs running
GPGPU-Sim uArch: Shader 52 finished CTA #1 (47884,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 19 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 40 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 6 finished CTA #0 (47888,0), 1 CTAs running
GPGPU-Sim uArch: Shader 57 finished CTA #0 (47892,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #1 (47904,0), 0 CTAs running
GPGPU-Sim uArch: Shader 55 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 52 finished CTA #0 (47906,0), 0 CTAs running
GPGPU-Sim uArch: Shader 52 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #0 (47909,0), 0 CTAs running
GPGPU-Sim uArch: Shader 20 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 0 finished CTA #0 (47914,0), 1 CTAs running
GPGPU-Sim uArch: Shader 10 finished CTA #1 (47915,0), 0 CTAs running
GPGPU-Sim uArch: Shader 10 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 29 finished CTA #1 (47919,0), 0 CTAs running
GPGPU-Sim uArch: Shader 29 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 11 finished CTA #0 (47938,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #1 (47963,0), 0 CTAs running
GPGPU-Sim uArch: Shader 45 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #1 (47964,0), 1 CTAs running
GPGPU-Sim uArch: Shader 11 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 11 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 57 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 57 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #0 (47967,0), 1 CTAs running
GPGPU-Sim uArch: Shader 43 finished CTA #0 (47975,0), 1 CTAs running
GPGPU-Sim uArch: Shader 47 finished CTA #0 (47978,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #0 (47985,0), 1 CTAs running
GPGPU-Sim uArch: Shader 36 finished CTA #0 (47987,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #1 (47988,0), 0 CTAs running
GPGPU-Sim uArch: Shader 4 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #1 (47991,0), 0 CTAs running
GPGPU-Sim uArch: Shader 9 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #0 (47996,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #1 (48000,0), 0 CTAs running
GPGPU-Sim uArch: Shader 21 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 51 finished CTA #1 (48001,0), 0 CTAs running
GPGPU-Sim uArch: Shader 51 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #0 (48008,0), 0 CTAs running
GPGPU-Sim uArch: Shader 15 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #0 (48010,0), 1 CTAs running
GPGPU-Sim uArch: Shader 30 finished CTA #0 (48018,0), 1 CTAs running
GPGPU-Sim uArch: Shader 0 finished CTA #1 (48019,0), 0 CTAs running
GPGPU-Sim uArch: Shader 0 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 54 finished CTA #0 (48026,0), 1 CTAs running
GPGPU-Sim uArch: Shader 6 finished CTA #1 (48028,0), 0 CTAs running
GPGPU-Sim uArch: Shader 6 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #0 (48031,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #0 (48042,0), 1 CTAs running
GPGPU-Sim uArch: Shader 54 finished CTA #1 (48056,0), 0 CTAs running
GPGPU-Sim uArch: Shader 54 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #1 (48059,0), 0 CTAs running
GPGPU-Sim uArch: Shader 24 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #1 (48061,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #1 (48063,0), 0 CTAs running
GPGPU-Sim uArch: Shader 48 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 30 finished CTA #1 (48070,0), 0 CTAs running
GPGPU-Sim uArch: Shader 30 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 25 finished CTA #1 (48079,0), 0 CTAs running
GPGPU-Sim uArch: Shader 25 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #1 (48081,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #1 (48083,0), 0 CTAs running
GPGPU-Sim uArch: Shader 59 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #0 (48084,0), 1 CTAs running
GPGPU-Sim uArch: Shader 49 finished CTA #1 (48086,0), 1 CTAs running
GPGPU-Sim uArch: Shader 39 finished CTA #1 (48096,0), 0 CTAs running
GPGPU-Sim uArch: Shader 39 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #0 (48098,0), 0 CTAs running
GPGPU-Sim uArch: Shader 2 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 49 finished CTA #0 (48102,0), 0 CTAs running
GPGPU-Sim uArch: Shader 49 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 28 finished CTA #0 (48104,0), 1 CTAs running
GPGPU-Sim uArch: Shader 28 finished CTA #1 (48107,0), 0 CTAs running
GPGPU-Sim uArch: Shader 28 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #0 (48111,0), 1 CTAs running
GPGPU-Sim uArch: Shader 50 finished CTA #0 (48111,0), 0 CTAs running
GPGPU-Sim uArch: Shader 50 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #1 (48114,0), 0 CTAs running
GPGPU-Sim uArch: Shader 22 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #1 (48132,0), 0 CTAs running
GPGPU-Sim uArch: Shader 58 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 47 finished CTA #1 (48143,0), 0 CTAs running
GPGPU-Sim uArch: Shader 47 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #0 (48153,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #1 (48153,0), 0 CTAs running
GPGPU-Sim uArch: Shader 35 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 36 finished CTA #1 (48154,0), 0 CTAs running
GPGPU-Sim uArch: Shader 36 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #1 (48160,0), 0 CTAs running
GPGPU-Sim uArch: Shader 31 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 26 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 33 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #0 (48165,0), 0 CTAs running
GPGPU-Sim uArch: Shader 38 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 34 finished CTA #0 (48177,0), 0 CTAs running
GPGPU-Sim uArch: Shader 34 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #1 (48188,0), 0 CTAs running
GPGPU-Sim uArch: Shader 56 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #1 (48189,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #0 (48192,0), 1 CTAs running
GPGPU-Sim uArch: Shader 37 finished CTA #1 (48202,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #1 (48212,0), 0 CTAs running
GPGPU-Sim uArch: Shader 1 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #0 (48216,0), 0 CTAs running
GPGPU-Sim uArch: Shader 41 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 37 finished CTA #0 (48218,0), 0 CTAs running
GPGPU-Sim uArch: Shader 37 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #1 (48221,0), 0 CTAs running
GPGPU-Sim uArch: Shader 46 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #0 (48226,0), 1 CTAs running
GPGPU-Sim uArch: Shader 44 finished CTA #1 (48233,0), 0 CTAs running
GPGPU-Sim uArch: Shader 44 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #1 (48248,0), 0 CTAs running
GPGPU-Sim uArch: Shader 42 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 43 finished CTA #1 (48281,0), 0 CTAs running
GPGPU-Sim uArch: Shader 43 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: GPU detected kernel '_Z14matrix_mul_gpuPiS_S_i' finished on shader 43.
kernel_name = _Z14matrix_mul_gpuPiS_S_i 
kernel_launch_uid = 1 
gpu_sim_cycle = 48282
gpu_sim_insn = 16632000
gpu_ipc =     344.4762
gpu_tot_sim_cycle = 48282
gpu_tot_sim_insn = 16632000
gpu_tot_ipc =     344.4762
gpu_tot_issued_cta = 0
gpu_stall_dramfull = 42547
gpu_stall_icnt2sh    = 68778
gpu_total_sim_rate=332640

========= Core cache stats =========
L1I_cache:
	L1I_total_cache_accesses = 371520
	L1I_total_cache_misses = 1920
	L1I_total_cache_miss_rate = 0.0052
	L1I_total_cache_pending_hits = 0
	L1I_total_cache_reservation_fails = 0
L1D_cache:
	L1D_cache_core[0]: Access = 21513, Miss = 1284, Miss_rate = 0.060, Pending_hits = 5147, Reservation_fails = 2614
	L1D_cache_core[1]: Access = 21489, Miss = 1278, Miss_rate = 0.059, Pending_hits = 5140, Reservation_fails = 1796
	L1D_cache_core[2]: Access = 21492, Miss = 1281, Miss_rate = 0.060, Pending_hits = 5151, Reservation_fails = 2084
	L1D_cache_core[3]: Access = 21457, Miss = 1268, Miss_rate = 0.059, Pending_hits = 5125, Reservation_fails = 1525
	L1D_cache_core[4]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5144, Reservation_fails = 928
	L1D_cache_core[5]: Access = 21481, Miss = 1272, Miss_rate = 0.059, Pending_hits = 5142, Reservation_fails = 2045
	L1D_cache_core[6]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5143, Reservation_fails = 2898
	L1D_cache_core[7]: Access = 21505, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5158, Reservation_fails = 3324
	L1D_cache_core[8]: Access = 21508, Miss = 1279, Miss_rate = 0.059, Pending_hits = 5149, Reservation_fails = 2750
	L1D_cache_core[9]: Access = 21505, Miss = 1287, Miss_rate = 0.060, Pending_hits = 5157, Reservation_fails = 3313
	L1D_cache_core[10]: Access = 21508, Miss = 1293, Miss_rate = 0.060, Pending_hits = 5164, Reservation_fails = 3276
	L1D_cache_core[11]: Access = 21505, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5148, Reservation_fails = 2931
	L1D_cache_core[12]: Access = 21508, Miss = 1292, Miss_rate = 0.060, Pending_hits = 5150, Reservation_fails = 2987
	L1D_cache_core[13]: Access = 21513, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5145, Reservation_fails = 2983
	L1D_cache_core[14]: Access = 21516, Miss = 1285, Miss_rate = 0.060, Pending_hits = 5136, Reservation_fails = 3154
	L1D_total_cache_accesses = 322468
	L1D_total_cache_misses = 19228
	L1D_total_cache_miss_rate = 0.0596
	L1D_total_cache_pending_hits = 77199
	L1D_total_cache_reservation_fails = 38608
	L1D_cache_data_port_util = 0.078
	L1D_cache_fill_port_util = 0.006
L1C_cache:
	L1C_total_cache_accesses = 1920
	L1C_total_cache_misses = 480
	L1C_total_cache_miss_rate = 0.2500
	L1C_total_cache_pending_hits = 0
	L1C_total_cache_reservation_fails = 0
L1T_cache:
	L1T_total_cache_accesses = 0
	L1T_total_cache_misses = 0
	L1T_total_cache_pending_hits = 0
	L1T_total_cache_reservation_fails = 0

Total_core_cache_stats:
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 225461
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 77199
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 17972
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 13408
	Total_core_cache_stats_breakdown[CONST_ACC_R][HIT] = 1440
	Total_core_cache_stats_breakdown[CONST_ACC_R][MISS] = 480
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 580
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 1256
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][RESERVATION_FAIL] = 25200
	Total_core_cache_stats_breakdown[INST_ACC_R][HIT] = 369600
	Total_core_cache_stats_breakdown[INST_ACC_R][MISS] = 1920
Shader 0 warp_id issue ditsribution:
warp_id:
0, 1, 2, 3, 4, 5, 6, 7, 
distro:
1388, 1388, 1388, 1388, 1388, 1388, 1388, 1388, 
gpgpu_n_tot_thrd_icount = 21319680
gpgpu_n_tot_w_icount = 666240
gpgpu_n_stall_shd_mem = 216596
gpgpu_n_mem_read_local = 0
gpgpu_n_mem_write_local = 0
gpgpu_n_mem_read_global = 17972
gpgpu_n_mem_write_global = 1836
gpgpu_n_mem_texture = 0
gpgpu_n_mem_const = 60
gpgpu_n_load_insn  = 3600000
gpgpu_n_store_insn = 12000
gpgpu_n_shmem_insn = 0
gpgpu_n_tex_insn = 0
gpgpu_n_const_mem_insn = 0
gpgpu_n_param_mem_insn = 48000
gpgpu_n_shmem_bkconflict = 0
gpgpu_n_cache_bkconflict = 0
gpgpu_n_intrawarp_mshr_merge = 0
gpgpu_n_cmem_portconflict = 0
gpgpu_stall_shd_mem[c_mem][bk_conf] = 0
gpgpu_stall_shd_mem[c_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[c_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[c_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[t_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[t_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[t_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[s_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][coal_stall] = 216596
gpgpu_stall_shd_mem[gl_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[g_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[g_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpu_reg_bank_conflict_stalls = 0
Warp Occupancy Distribution:
Stall:169500	W0_Idle:293067	W0_Scoreboard:4645929	W1:0	W2:0	W3:0	W4:166560	W5:0	W6:0	W7:0	W8:0	W9:0	W10:0	W11:0	W12:0	W13:0	W14:0	W15:0	W16:0	W17:0	W18:0	W19:0	W20:0	W21:0	W22:0	W23:0	W24:0	W25:0	W26:0	W27:0	W28:0	W29:0	W30:0	W31:0	W32:499680
traffic_breakdown_coretomem[CONST_ACC_R] = 480 {8:60,}
traffic_breakdown_coretomem[GLOBAL_ACC_R] = 143776 {8:17972,}
traffic_breakdown_coretomem[GLOBAL_ACC_W] = 117600 {40:1008,72:552,136:276,}
traffic_breakdown_coretomem[INST_ACC_R] = 1920 {8:240,}
traffic_breakdown_memtocore[CONST_ACC_R] = 4320 {72:60,}
traffic_breakdown_memtocore[GLOBAL_ACC_R] = 2444192 {136:17972,}
traffic_breakdown_memtocore[GLOBAL_ACC_W] = 14688 {8:1836,}
traffic_breakdown_memtocore[INST_ACC_R] = 32640 {136:240,}
maxmrqlatency = 12 
maxdqlatency = 0 
maxmflatency = 1356 
averagemflatency = 264 
max_icnt2mem_latency = 1205 
max_icnt2sh_latency = 48281 
mrq_lat_table:1080 	32 	4 	10 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
dq_lat_table:0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_table:0 	0 	0 	0 	0 	0 	0 	10720 	8178 	899 	71 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2mem_lat_table:0 	0 	0 	15440 	432 	764 	1403 	1061 	742 	253 	13 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2sh_lat_table:0 	0 	0 	3446 	13655 	892 	39 	0 	0 	0 	0 	0 	0 	0 	0 	1836 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_pw_table:0 	0 	0 	0 	0 	0 	0 	87 	6 	2 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
maximum concurrent accesses to same row:
dram[0]:         1         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
maximum service time to same row:
dram[0]:       502         0         0         0      1650      1707      1653      1501     12085     12831     36822     37578         0         0         0         0 
dram[1]:      1485         0         0         0      1763      1494      1765      1521     12204     13041     36903     37765         0         0         0         0 
dram[2]:         0         0         0         0      1503      1510      1526      1516     12325     13141     37102     37849         0         0         0         0 
dram[3]:         0         0         0         0      1499      1588      1525      1678     12515     13263     37144     38024         0         0         0         0 
dram[4]:         0         0         0         0      1616      1493      1484      1515     12585     13452     37341     38075         0         0         0         0 
dram[5]:         0         0         0         0      1497      1606      1519      1688     12779     13531     37456     38221         0         0         0         0 
average row accesses per activate:
dram[0]:  1.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 21.000000 19.000000      -nan      -nan      -nan      -nan 
dram[1]:  2.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 19.000000 16.000000      -nan      -nan      -nan      -nan 
dram[2]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 18.000000      -nan      -nan      -nan      -nan 
dram[3]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 17.000000      -nan      -nan      -nan      -nan 
dram[4]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 18.000000 21.000000      -nan      -nan      -nan      -nan 
dram[5]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 22.000000 16.000000      -nan      -nan      -nan      -nan 
average row locality = 1126/52 = 21.653847
number of total memory accesses made:
dram[0]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
total accesses: 0
min_bank_accesses = 0!
min_chip_accesses = 0!
number of total read accesses:
dram[0]:         3         0         0         0        10        10        32        32        32        32        16        16         0         0         0         0 
dram[1]:         2         0         0         0        10        10        32        32        32        32        16        15         0         0         0         0 
dram[2]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[3]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[4]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[5]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
total reads: 1084
min_bank_accesses = 0!
chip skew: 183/180 = 1.02
number of total write accesses:
dram[0]:         0         0         0         0         0         0         0         0         0         0         5         3         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         3         1         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         3         4         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         3         3         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         2         7         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         6         2         0         0         0         0 
total reads: 42
min_bank_accesses = 0!
chip skew: 9/4 = 2.25
average mf latency per bank:
dram[0]:       8505    none      none      none        6219      4879      4705      5583      4863      4800      4214      4411    none      none      none      none  
dram[1]:          0    none      none      none        4030      5132      5447      4339      4812      4857      4258      5293    none      none      none      none  
dram[2]:     none      none      none      none        4973      3789      4482      4576      4719      4883      4095      3891    none      none      none      none  
dram[3]:     none      none      none      none        3574      4782      4611      4852      4883      5030      4219      3979    none      none      none      none  
dram[4]:     none      none      none      none        6249      4014      5355      4358      4534      4853      4952      3543    none      none      none      none  
dram[5]:     none      none      none      none        4108      5204      4843      5156      4601      4623      3826      4504    none      none      none      none  
maximum mf latency per bank:
dram[0]:        486         0         0         0      1110      1069      1107      1325       592       781       876       721         0         0         0         0
dram[1]:          0         0         0         0      1097       809      1356       876       801       646       697       706         0         0         0         0
dram[2]:          0         0         0         0       882       679       907      1071       716       771       873       759         0         0         0         0
dram[3]:          0         0         0         0       377       983      1018      1068       742       781       690       817         0         0         0         0
dram[4]:          0         0         0         0      1081       527      1180       723       799       631       718       700         0         0         0         0
dram[5]:          0         0         0         0       543      1153      1006      1168       728       775       873       742         0         0         0         0

Number of Memory Banks Accessed per Memory Operation per Warp (from 0):
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
Average # of Memory Banks Accessed per Memory Operation per Warp=-nan

position of mrq chosen
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	

average position of mrq chosen = -nan
Memory Partition 0: 
Cache L2_bank_000:
MSHR contents

Cache L2_bank_001:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[0]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63344 n_act=11 n_pre=2 n_req=191 n_rd=366 n_write=8 bw_util=0.01174
n_activity=2903 dram_eff=0.2577
bk0: 6a 63659i bk1: 0a 63730i bk2: 0a 63734i bk3: 0a 63734i bk4: 20a 63671i bk5: 20a 63680i bk6: 64a 63581i bk7: 64a 63588i bk8: 64a 63586i bk9: 64a 63580i bk10: 32a 63625i bk11: 32a 63618i bk12: 0a 63727i bk13: 0a 63727i bk14: 0a 63730i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000784548
Memory Partition 1: 
Cache L2_bank_002:
MSHR contents

Cache L2_bank_003:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[1]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=9 n_pre=0 n_req=185 n_rd=362 n_write=5 bw_util=0.01152
n_activity=2703 dram_eff=0.2716
bk0: 4a 63711i bk1: 0a 63732i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 20a 63676i bk6: 64a 63580i bk7: 64a 63580i bk8: 64a 63587i bk9: 64a 63581i bk10: 32a 63624i bk11: 30a 63641i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63732i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000266746
Memory Partition 2: 
Cache L2_bank_004:
MSHR contents

Cache L2_bank_005:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[2]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=187 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2765 dram_eff=0.2662
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63679i bk5: 24a 63671i bk6: 64a 63581i bk7: 64a 63590i bk8: 64a 63588i bk9: 64a 63587i bk10: 32a 63623i bk11: 28a 63638i bk12: 0a 63728i bk13: 0a 63730i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000235364
Memory Partition 3: 
Cache L2_bank_006:
MSHR contents

Cache L2_bank_007:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[3]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63357 n_act=8 n_pre=0 n_req=186 n_rd=360 n_write=6 bw_util=0.01149
n_activity=2830 dram_eff=0.2587
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63680i bk5: 24a 63670i bk6: 64a 63580i bk7: 64a 63572i bk8: 64a 63586i bk9: 64a 63589i bk10: 32a 63621i bk11: 28a 63638i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000580565
Memory Partition 4: 
Cache L2_bank_008:
MSHR contents

Cache L2_bank_009:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[4]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63353 n_act=8 n_pre=0 n_req=189 n_rd=360 n_write=10 bw_util=0.01161
n_activity=2771 dram_eff=0.2671
bk0: 0a 63731i bk1: 0a 63733i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 24a 63670i bk6: 64a 63587i bk7: 64a 63587i bk8: 64a 63588i bk9: 64a 63581i bk10: 32a 63633i bk11: 28a 63627i bk12: 0a 63728i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000313819
Memory Partition 5: 
Cache L2_bank_010:
MSHR contents

Cache L2_bank_011:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[5]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=188 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2789 dram_eff=0.2639
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63733i bk4: 20a 63678i bk5: 24a 63668i bk6: 64a 63584i bk7: 64a 63582i bk8: 64a 63589i bk9: 64a 63588i bk10: 32a 63598i bk11: 28a 63623i bk12: 0a 63729i bk13: 0a 63730i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.00105129

========= L2 cache stats =========
L2_cache_bank[0]: Access = 1826, Miss = 93, Miss_rate = 0.051, Pending_hits = 248, Reservation_fails = 4441
L2_cache_bank[1]: Access = 1684, Miss = 90, Miss_rate = 0.053, Pending_hits = 231, Reservation_fails = 3387
L2_cache_bank[2]: Access = 1777, Miss = 92, Miss_rate = 0.052, Pending_hits = 239, Reservation_fails = 3478
L2_cache_bank[3]: Access = 1652, Miss = 89, Miss_rate = 0.054, Pending_hits = 227, Reservation_fails = 3558
L2_cache_bank[4]: Access = 1642, Miss = 90, Miss_rate = 0.055, Pending_hits = 236, Reservation_fails = 3430
L2_cache_bank[5]: Access = 1661, Miss = 90, Miss_rate = 0.054, Pending_hits = 232, Reservation_fails = 3472
L2_cache_bank[6]: Access = 1637, Miss = 90, Miss_rate = 0.055, Pending_hits = 237, Reservation_fails = 3884
L2_cache_bank[7]: Access = 1639, Miss = 90, Miss_rate = 0.055, Pending_hits = 250, Reservation_fails = 4069
L2_cache_bank[8]: Access = 1656, Miss = 90, Miss_rate = 0.054, Pending_hits = 250, Reservation_fails = 3783
L2_cache_bank[9]: Access = 1643, Miss = 90, Miss_rate = 0.055, Pending_hits = 241, Reservation_fails = 3742
L2_cache_bank[10]: Access = 1641, Miss = 90, Miss_rate = 0.055, Pending_hits = 239, Reservation_fails = 3801
L2_cache_bank[11]: Access = 1650, Miss = 90, Miss_rate = 0.055, Pending_hits = 243, Reservation_fails = 3849
L2_total_cache_accesses = 20108
L2_total_cache_misses = 1084
L2_total_cache_miss_rate = 0.0539
L2_total_cache_pending_hits = 2873
L2_total_cache_reservation_fails = 44894
L2_total_cache_breakdown:
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 14077
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 2839
	L2_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 1056
	L2_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 44379
	L2_cache_stats_breakdown[CONST_ACC_R][HIT] = 56
	L2_cache_stats_breakdown[CONST_ACC_R][HIT_RESERVED] = 3
	L2_cache_stats_breakdown[CONST_ACC_R][MISS] = 1
	L2_cache_stats_breakdown[CONST_ACC_R][RESERVATION_FAIL] = 129
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 1794
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT_RESERVED] = 19
	L2_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 23
	L2_cache_stats_breakdown[INST_ACC_R][HIT] = 224
	L2_cache_stats_breakdown[INST_ACC_R][HIT_RESERVED] = 12
	L2_cache_stats_breakdown[INST_ACC_R][MISS] = 4
	L2_cache_stats_breakdown[INST_ACC_R][RESERVATION_FAIL] = 386
L2_cache_data_port_util = 0.104
L2_cache_fill_port_util = 0.007

icnt_total_pkts_mem_to_simt=93076
icnt_total_pkts_simt_to_mem=23324
LD_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ST_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
----------------------------Interconnect-DETAILS--------------------------------
Class 0:
Packet latency average = 18.5053
	minimum = 6
	maximum = 729
Network latency average = 13.7655
	minimum = 6
	maximum = 426
Slowest packet = 623
Flit latency average = 11.0758
	minimum = 6
	maximum = 426
Slowest flit = 1683
Fragmentation average = 0
	minimum = 0
	maximum = 0
Injected packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Accepted packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Injected flit rate average = 0.0892902
	minimum = 0.0318131 (at node 3)
	maximum = 0.173936 (at node 15)
Accepted flit rate average= 0.0892902
	minimum = 0.0392072 (at node 24)
	maximum = 0.129406 (at node 9)
Injected packet length average = 2.89437
Accepted packet length average = 2.89437
Total in-flight flits = 0 (0 measured)
====== Overall Traffic Statistics ======
====== Traffic class 0 ======
Packet latency average = 18.5053 (1 samples)
	minimum = 6 (1 samples)
	maximum = 729 (1 samples)
Network latency average = 13.7655 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Flit latency average = 11.0758 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Fragmentation average = 0 (1 samples)
	minimum = 0 (1 samples)
	maximum = 0 (1 samples)
Injected packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Accepted packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Injected flit rate average = 0.0892902 (1 samples)
	minimum = 0.0318131 (1 samples)
	maximum = 0.173936 (1 samples)
Accepted flit rate average = 0.0892902 (1 samples)
	minimum = 0.0392072 (1 samples)
	maximum = 0.129406 (1 samples)
Injected packet size average = 2.89437 (1 samples)
Accepted packet size average = 2.89437 (1 samples)
Hops average = 1 (1 samples)
----------------------------END-of-Interconnect-DETAILS-------------------------


gpgpu_simulation_time = 0 days, 0 hrs, 0 min, 50 sec (50 sec)
gpgpu_simulation_rate = 332640 (inst/sec)
gpgpu_simulation_rate = 965 (cycle/sec)
total time is 50182 ms


        *** GPGPU-Sim Simulator Version 3.2.2  [build 0] ***


GPGPU-Sim PTX: simulation mode 0 (can change with PTX_SIM_MODE_FUNC environment variable:
               1=functional simulation only, 0=detailed performance simulator)
GPGPU-Sim: Configuration options:

-network_mode                           1 # Interconnection network mode
-inter_config_file   config_fermi_islip.icnt # Interconnection network config file
-gpgpu_ptx_use_cuobjdump                    1 # Use cuobjdump to extract ptx and sass from binaries
-gpgpu_experimental_lib_support                    0 # Try to extract code from cuda libraries [Broken because of unknown cudaGetExportTable]
-gpgpu_ptx_convert_to_ptxplus                    0 # Convert SASS (native ISA) to ptxplus and run ptxplus
-gpgpu_ptx_force_max_capability                   20 # Force maximum compute capability
-gpgpu_ptx_inst_debug_to_file                    0 # Dump executed instructions' debug information to file
-gpgpu_ptx_inst_debug_file       inst_debug.txt # Executed instructions' debug output file
-gpgpu_ptx_inst_debug_thread_uid                    1 # Thread UID for executed instructions' debug output
-gpgpu_simd_model                       1 # 1 = post-dominator
-gpgpu_shader_core_pipeline              1536:32 # shader core pipeline config, i.e., {<nthread>:<warpsize>}
-gpgpu_tex_cache:l1  4:128:24,L:R:m:N:L,F:128:4,128:2 # per-shader L1 texture cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>:<rf>}
-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4 # per-shader L1 constant memory cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:il1     4:128:4,L:R:f:N:L,A:2:32,4 # shader L1 instruction cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:dl1     32:128:4,L:L:m:N:H,A:32:8,8 # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PrefL1                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PreShared                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gmem_skip_L1D                          0 # global memory access skip L1D cache (implements -Xptxas -dlcm=cg, default=no skip)
-gpgpu_perfect_mem                      0 # enable perfect memory mode (no cache miss)
-n_regfile_gating_group                    4 # group of lanes that should be read/written together)
-gpgpu_clock_gated_reg_file                    0 # enable clock gated reg file for power calculations
-gpgpu_clock_gated_lanes                    0 # enable clock gated lanes for power calculations
-gpgpu_shader_registers                32768 # Number of registers per shader core. Limits number of concurrent CTAs. (default 8192)
-gpgpu_shader_cta                       8 # Maximum number of concurrent CTAs in shader (default 8)
-gpgpu_num_cta_barriers                   16 # Maximum number of named barriers per CTA (default 16)
-gpgpu_n_clusters                      15 # number of processing clusters
-gpgpu_n_cores_per_cluster                    4 # number of simd cores per cluster
-gpgpu_n_cluster_ejection_buffer_size                    8 # number of packets in ejection buffer
-gpgpu_n_ldst_response_buffer_size                    2 # number of response packets in ld/st unit ejection buffer
-gpgpu_shmem_size                   16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size                   49152 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefL1                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefShared                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_num_banks                   32 # Number of banks in the shared memory in each shader core (default 16)
-gpgpu_shmem_limited_broadcast                    0 # Limit shared memory to do one broadcast per cycle (default on)
-gpgpu_shmem_warp_parts                    1 # Number of portions a warp is divided into for shared memory bank conflict check 
-gpgpu_warpdistro_shader                   -1 # Specify which shader core to collect the warp size distribution from
-gpgpu_warp_issue_shader                    0 # Specify which shader core to collect the warp issue distribution from
-gpgpu_local_mem_map                    1 # Mapping from local memory space address to simulated GPU physical address space (default = enabled)
-gpgpu_num_reg_banks                   16 # Number of register banks (default = 8)
-gpgpu_reg_bank_use_warp_id                    0 # Use warp ID in mapping registers to banks (default = off)
-gpgpu_operand_collector_num_units_sp                    6 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_sfu                    8 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_mem                    2 # number of collector units (default = 2)
-gpgpu_operand_collector_num_units_gen                    0 # number of collector units (default = 0)
-gpgpu_operand_collector_num_in_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_operand_collector_num_out_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_coalesce_arch                   13 # Coalescing arch (default = 13, anything else is off for now)
-gpgpu_num_sched_per_core                    2 # Number of warp schedulers per core
-gpgpu_max_insn_issue_per_warp                    1 # Max number of instructions that can be issued per warp in one cycle by scheduler
-gpgpu_simt_core_sim_order                    1 # Select the simulation order of cores in a cluster (0=Fix, 1=Round-Robin)
-gpgpu_pipeline_widths        2,1,1,2,1,1,2 # Pipeline widths ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
-gpgpu_num_sp_units                     2 # Number of SP units (default=1)
-gpgpu_num_sfu_units                    1 # Number of SF units (default=1)
-gpgpu_num_mem_units                    1 # Number if ldst units (default=1) WARNING: not hooked up to anything
-gpgpu_scheduler                      gto # Scheduler configuration: < lrr | gto | two_level_active > If two_level_active:<num_active_warps>:<inner_prioritization>:<outer_prioritization>For complete list of prioritization values see shader.h enum scheduler_prioritization_typeDefault: gto
-gpgpu_dram_scheduler                    1 # 0 = fifo, 1 = FR-FCFS (defaul)
-gpgpu_dram_partition_queues              8:8:8:8 # i2$:$2d:d2$:$2i
-l2_ideal                               0 # Use a ideal L2 cache that always hit
-gpgpu_cache:dl2     64:128:8,L:B:m:W:L,A:32:4,4:0,32 # unified banked L2 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>}
-gpgpu_cache:dl2_texture_only                    0 # L2 cache used for texture only
-gpgpu_n_mem                            6 # number of memory modules (e.g. memory controllers) in gpu
-gpgpu_n_sub_partition_per_mchannel                    2 # number of memory subpartition in each memory module
-gpgpu_n_mem_per_ctrlr                    2 # number of memory chips per memory controller
-gpgpu_memlatency_stat                   14 # track and display latency statistics 0x2 enables MC, 0x4 enables queue logs
-gpgpu_frfcfs_dram_sched_queue_size                   16 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_return_queue_size                  116 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_buswidth                    4 # default = 4 bytes (8 bytes per cycle at DDR)
-gpgpu_dram_burst_length                    8 # Burst length of each DRAM request (default = 4 data bus cycle)
-dram_data_command_freq_ratio                    4 # Frequency ratio between DRAM data bus and command bus (default = 2 times, i.e. DDR)
-gpgpu_dram_timing_opt nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2 # DRAM timing parameters = {nbk:tCCD:tRRD:tRCD:tRAS:tRP:tRC:CL:WL:tCDLR:tWR:nbkgrp:tCCDL:tRTPL}
-rop_latency                          120 # ROP queue latency (default 85)
-dram_latency                         100 # DRAM latency (default 30)
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS # mapping memory address to dram model {dramid@<start bit>;<memory address map>}
-gpgpu_mem_addr_test                    0 # run sweep test to check address mapping for aliased address
-gpgpu_mem_address_mask                    1 # 0 = old addressing mask, 1 = new addressing mask, 2 = new add. mask + flipped bank sel and chip sel bits
-gpuwattch_xml_file  gpuwattch_gtx480.xml # GPUWattch XML file
-power_simulation_enabled                    1 # Turn on power simulator (1=On, 0=Off)
-power_per_cycle_dump                    0 # Dump detailed power output each cycle
-power_trace_enabled                    0 # produce a file for the power trace (1=On, 0=Off)
-power_trace_zlevel                     6 # Compression level of the power trace output log (0=no comp, 9=highest)
-steady_power_levels_enabled                    0 # produce a file for the steady power levels (1=On, 0=Off)
-steady_state_definition                  8:4 # allowed deviation:number of samples
-gpgpu_max_cycle                        0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_insn                         0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_cta                          0 # terminates gpu simulation early (0 = no limit)
-gpgpu_runtime_stat                   500 # display runtime statistics such as dram utilization {<freq>:<flag>}
-liveness_message_freq                    1 # Minimum number of seconds between simulation liveness messages (0 = always print)
-gpgpu_flush_l1_cache                    0 # Flush L1 cache at the end of each kernel call
-gpgpu_flush_l2_cache                    0 # Flush L2 cache at the end of each kernel call
-gpgpu_deadlock_detect                    1 # Stop the simulation at deadlock (1=on (default), 0=off)
-gpgpu_ptx_instruction_classification                    0 # if enabled will classify ptx instruction types per kernel (Max 255 kernels now)
-gpgpu_ptx_sim_mode                     0 # Select between Performance (default) or Functional simulation (1)
-gpgpu_clock_domains 700.0:700.0:700.0:924.0 # Clock Domain Frequencies in MhZ {<Core Clock>:<ICNT Clock>:<L2 Clock>:<DRAM Clock>}
-gpgpu_max_concurrent_kernel                    8 # maximum kernels that can run concurrently on GPU
-gpgpu_cflog_interval                    0 # Interval between each snapshot in control flow logger
-visualizer_enabled                     0 # Turn on visualizer output (1=On, 0=Off)
-visualizer_outputfile                 NULL # Specifies the output log file for visualizer
-visualizer_zlevel                      6 # Compression level of the visualizer output log (0=no comp, 9=highest)
-trace_enabled                          0 # Turn on traces
-trace_components                    none # comma seperated list of traces to enable. Complete list found in trace_streams.tup. Default none
-trace_sampling_core                    0 # The core which is printed using CORE_DPRINTF. Default 0
-trace_sampling_memory_partition                   -1 # The memory partition which is printed using MEMPART_DPRINTF. Default -1 (i.e. all)
-enable_ptx_file_line_stats                    1 # Turn on PTX source line statistic profiling. (1 = On)
-ptx_line_stats_filename gpgpu_inst_stats.txt # Output file for PTX source line statistics.
-save_embedded_ptx                      0 # saves ptx files embedded in binary as <n>.ptx
-keep                                   0 # keep intermediate files created by GPGPU-Sim when interfacing with external programs
-gpgpu_ptx_save_converted_ptxplus                    0 # Saved converted ptxplus to a file
-ptx_opcode_latency_int         4,13,4,5,145 # Opcode latencies for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,19,25,145
-ptx_opcode_latency_fp          4,13,4,5,39 # Opcode latencies for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,30
-ptx_opcode_latency_dp         8,19,8,8,330 # Opcode latencies for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,335
-ptx_opcode_initiation_int            1,2,2,1,8 # Opcode initiation intervals for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,4,4,32
-ptx_opcode_initiation_fp            1,2,1,1,4 # Opcode initiation intervals for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,5
-ptx_opcode_initiation_dp         8,16,8,8,130 # Opcode initiation intervals for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,130
DRAM Timing Options:
nbk                                    16 # number of banks
CCD                                     2 # column to column delay
RRD                                     6 # minimal delay between activation of rows in different banks
RCD                                    12 # row to column delay
RAS                                    28 # time needed to activate row
RP                                     12 # time needed to precharge (deactivate) row
RC                                     40 # row cycle time
CDLR                                    5 # switching from write to read (changes tWTR)
WR                                     12 # last data-in to row precharge
CL                                     12 # CAS latency
WL                                      4 # Write latency
nbkgrp                                  4 # number of bank groups
CCDL                                    3 # column to column delay between accesses to different bank groups
RTPL                                    2 # read to precharge delay between accesses to different bank groups
Total number of memory sub partition = 12
addr_dec_mask[CHIP]  = 0000000000000000 	high:64 low:0
addr_dec_mask[BK]    = 000000000000e100 	high:16 low:8
addr_dec_mask[ROW]   = 000000000fff0000 	high:28 low:16
addr_dec_mask[COL]   = 0000000000001eff 	high:13 low:0
addr_dec_mask[BURST] = 000000000000003f 	high:6 low:0
sub_partition_id_mask = 0000000000000100
GPGPU-Sim uArch: clock freqs: 700000000.000000:700000000.000000:700000000.000000:924000000.000000
GPGPU-Sim uArch: clock periods: 0.00000000142857142857:0.00000000142857142857:0.00000000142857142857:0.00000000108225108225
*** Initializing Memory Statistics ***
GPGPU-Sim uArch: interconnect node map (shaderID+MemID to icntID)
GPGPU-Sim uArch: Memory nodes ID start from index: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
GPGPU-Sim uArch: interconnect node reverse map (icntID to shaderID+MemID)
GPGPU-Sim uArch: Memory nodes start from ID: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
83a4e518f69376f7e08643a3a9e17862  /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
GPGPU-Sim uArch: performance model initialization complete.
GPGPU-Sim PTX: __cudaRegisterFatBinary, fat_cubin_handle = 1, filename=mm.cu
self exe links to: /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
Running md5sum using "md5sum /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM "
Running cuobjdump using "$CUDA_INSTALL_PATH/bin/cuobjdump -ptx -elf -sass /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM > _cuobjdump_complete_output_w6RpSs"
Parsing file _cuobjdump_complete_output_w6RpSs
######### cuobjdump parser ########
## Adding new section ELF
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_1.ptx
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section ELF
Adding arch: sm_20
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_2.ptx
Adding arch: sm_20
Adding identifier: mm.cu
Done parsing!!!
GPGPU-Sim PTX: __cudaRegisterFunction _Z14matrix_mul_gpuPiS_S_i : hostFun 0x0x400ce0, fat_cubin_handle = 1
GPGPU-Sim PTX: instruction assembly for function '_Z14matrix_mul_gpuPiS_S_i'...   done.
GPGPU-Sim PTX: finding reconvergence points for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: reconvergence points for _Z14matrix_mul_gpuPiS_S_i...
GPGPU-Sim PTX:  1 (potential) branch divergence @  PC=0x048 (_1.ptx:71) @%p1 bra $Lt_0_2306;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX:  2 (potential) branch divergence @  PC=0x130 (_1.ptx:103) @%p2 bra $Lt_0_1794;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:  3 (potential) branch divergence @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX: ... end of reconvergence points for _Z14matrix_mul_gpuPiS_S_i
GPGPU-Sim PTX: ... done pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'.
GPGPU-Sim PTX: finished parsing EMBEDDED .ptx file _1.ptx
Adding _cuobjdump_2.ptx with cubin handle 1
GPGPU-Sim PTX: extracting embedded .ptx to temporary file "_ptx_vgHFyW"
Running: cat _ptx_vgHFyW | sed 's/.version 1.5/.version 1.4/' | sed 's/, texmode_independent//' | sed 's/\(\.extern \.const\[1\] .b8 \w\+\)\[\]/\1\[1\]/' | sed 's/const\[.\]/const\[0\]/g' > _ptx2_sz9Veq
GPGPU-Sim PTX: generating ptxinfo using "$CUDA_INSTALL_PATH/bin/ptxas --gpu-name=sm_20 -v _ptx2_sz9Veq --output-file  /dev/null 2> _ptx_vgHFyWinfo"
GPGPU-Sim PTX: Kernel '_Z14matrix_mul_gpuPiS_S_i' : regs=14, lmem=0, smem=0, cmem=60
GPGPU-Sim PTX: removing ptxinfo using "rm -f _ptx_vgHFyW _ptx2_sz9Veq _ptx_vgHFyWinfo"
GPGPU-Sim PTX: loading globals with explicit initializers... 
GPGPU-Sim PTX: finished loading globals (0 bytes total).
GPGPU-Sim PTX: loading constants with explicit initializers...  done.
Block(10,10)   Grid(15,8).

GPGPU-Sim PTX: cudaLaunch for 0x0x400ce0 (mode=performance simulation) on stream 0
GPGPU-Sim PTX: pushing kernel '_Z14matrix_mul_gpuPiS_S_i' to stream 0, gridDim= (15,8,1) blockDim = (10,10,1) 
kernel '_Z14matrix_mul_gpuPiS_S_i' transfer to GPU hardware scheduler
GPGPU-Sim uArch: Shader 4 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: CTA/core = 8, limited by: cta_limit
GPGPU-Sim uArch: core:  4, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 8 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  8, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 12 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 12, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 16 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 16, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 20 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 20, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 24 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 24, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 28 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 28, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 32 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 32, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 36 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 36, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 40 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 40, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 44 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 44, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 48 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 48, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 52 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 52, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 56 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 56, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 0 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  0, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 5 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  5, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 9 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  9, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 13 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 13, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 17 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 17, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 21 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 21, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 25 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 25, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 29 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 29, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 33 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 33, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 37 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 37, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 41 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 41, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 45 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 45, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 49 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 49, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 53 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 53, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 57 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 57, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 1 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  1, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 6 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  6, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 10 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 10, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 14 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 14, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 18 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 18, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 22 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 22, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 26 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 26, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 30 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 30, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 34 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 34, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 38 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 38, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 42 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 42, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 46 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 46, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 50 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 50, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 54 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 54, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 58 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 58, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 2 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  2, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 7 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  7, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 11 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 11, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 15 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 15, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 19 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 19, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 23 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 23, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 27 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 27, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 31 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 31, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 35 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 35, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 39 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 39, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 43 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 43, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 47 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 47, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 51 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 51, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 55 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 55, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 59 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 59, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 3 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  3, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: core:  4, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  8, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 12, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 16, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 20, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 24, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 28, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 32, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 36, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 40, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 44, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 48, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 52, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 56, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  0, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  5, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  9, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 13, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 17, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 21, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 25, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 29, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 33, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 37, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 41, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 45, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 49, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 53, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 57, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  1, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  6, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 10, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 14, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 18, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 22, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 26, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 30, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 34, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 38, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 42, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 46, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 50, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 54, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 58, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  2, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  7, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 11, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 15, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 19, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 23, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 27, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 31, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 35, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 39, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 43, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 47, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 51, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 55, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 59, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core:  3, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: cycles simulated: 500  inst.: 49456 (ipc=98.9) sim_rate=49456 (inst/sec) elapsed = 0:0:00:01 / Mon Jun 14 15:45:28 2021
GPGPU-Sim PTX: 100000 instructions simulated : ctaid=(1,0,0) tid=(1,5,0)
GPGPU-Sim PTX: 200000 instructions simulated : ctaid=(0,4,0) tid=(1,1,0)
GPGPU-Sim PTX: 300000 instructions simulated : ctaid=(8,2,0) tid=(5,3,0)
GPGPU-Sim uArch: cycles simulated: 1500  inst.: 305100 (ipc=203.4) sim_rate=152550 (inst/sec) elapsed = 0:0:00:02 / Mon Jun 14 15:45:29 2021
GPGPU-Sim PTX: 400000 instructions simulated : ctaid=(2,2,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 2500  inst.: 473544 (ipc=189.4) sim_rate=157848 (inst/sec) elapsed = 0:0:00:03 / Mon Jun 14 15:45:30 2021
GPGPU-Sim PTX: 500000 instructions simulated : ctaid=(12,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 600000 instructions simulated : ctaid=(3,6,0) tid=(7,4,0)
GPGPU-Sim PTX: 700000 instructions simulated : ctaid=(8,1,0) tid=(3,8,0)
GPGPU-Sim PTX: 800000 instructions simulated : ctaid=(3,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 900000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 3500  inst.: 874228 (ipc=249.8) sim_rate=218557 (inst/sec) elapsed = 0:0:00:04 / Mon Jun 14 15:45:31 2021
GPGPU-Sim PTX: 1000000 instructions simulated : ctaid=(8,6,0) tid=(7,8,0)
GPGPU-Sim PTX: 1100000 instructions simulated : ctaid=(1,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 1200000 instructions simulated : ctaid=(5,0,0) tid=(9,1,0)
GPGPU-Sim PTX: 1300000 instructions simulated : ctaid=(11,0,0) tid=(9,9,0)
GPGPU-Sim uArch: cycles simulated: 4500  inst.: 1279928 (ipc=284.4) sim_rate=255985 (inst/sec) elapsed = 0:0:00:05 / Mon Jun 14 15:45:32 2021
GPGPU-Sim PTX: 1400000 instructions simulated : ctaid=(9,0,0) tid=(3,4,0)
GPGPU-Sim PTX: 1500000 instructions simulated : ctaid=(10,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 1600000 instructions simulated : ctaid=(4,5,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 5500  inst.: 1658768 (ipc=301.6) sim_rate=276461 (inst/sec) elapsed = 0:0:00:06 / Mon Jun 14 15:45:33 2021
GPGPU-Sim PTX: 1700000 instructions simulated : ctaid=(10,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 1800000 instructions simulated : ctaid=(0,3,0) tid=(5,7,0)
GPGPU-Sim PTX: 1900000 instructions simulated : ctaid=(1,1,0) tid=(7,6,0)
GPGPU-Sim PTX: 2000000 instructions simulated : ctaid=(14,4,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 6500  inst.: 2010508 (ipc=309.3) sim_rate=287215 (inst/sec) elapsed = 0:0:00:07 / Mon Jun 14 15:45:34 2021
GPGPU-Sim PTX: 2100000 instructions simulated : ctaid=(2,5,0) tid=(1,7,0)
GPGPU-Sim PTX: 2200000 instructions simulated : ctaid=(5,2,0) tid=(9,7,0)
GPGPU-Sim PTX: 2300000 instructions simulated : ctaid=(1,2,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 7500  inst.: 2365768 (ipc=315.4) sim_rate=295721 (inst/sec) elapsed = 0:0:00:08 / Mon Jun 14 15:45:35 2021
GPGPU-Sim PTX: 2400000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 2500000 instructions simulated : ctaid=(14,2,0) tid=(3,4,0)
GPGPU-Sim PTX: 2600000 instructions simulated : ctaid=(11,4,0) tid=(5,3,0)
GPGPU-Sim PTX: 2700000 instructions simulated : ctaid=(0,1,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 8500  inst.: 2717496 (ipc=319.7) sim_rate=301944 (inst/sec) elapsed = 0:0:00:09 / Mon Jun 14 15:45:36 2021
GPGPU-Sim PTX: 2800000 instructions simulated : ctaid=(1,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 2900000 instructions simulated : ctaid=(6,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 3000000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 9500  inst.: 3069396 (ipc=323.1) sim_rate=306939 (inst/sec) elapsed = 0:0:00:10 / Mon Jun 14 15:45:37 2021
GPGPU-Sim PTX: 3100000 instructions simulated : ctaid=(9,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 3200000 instructions simulated : ctaid=(5,1,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 10000  inst.: 3257748 (ipc=325.8) sim_rate=296158 (inst/sec) elapsed = 0:0:00:11 / Mon Jun 14 15:45:38 2021
GPGPU-Sim PTX: 3300000 instructions simulated : ctaid=(12,6,0) tid=(9,5,0)
GPGPU-Sim PTX: 3400000 instructions simulated : ctaid=(0,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 3500000 instructions simulated : ctaid=(14,7,0) tid=(3,8,0)
GPGPU-Sim PTX: 3600000 instructions simulated : ctaid=(8,0,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 11000  inst.: 3600988 (ipc=327.4) sim_rate=300082 (inst/sec) elapsed = 0:0:00:12 / Mon Jun 14 15:45:39 2021
GPGPU-Sim PTX: 3700000 instructions simulated : ctaid=(12,3,0) tid=(5,3,0)
GPGPU-Sim PTX: 3800000 instructions simulated : ctaid=(13,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 3900000 instructions simulated : ctaid=(4,7,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 12000  inst.: 3965588 (ipc=330.5) sim_rate=305045 (inst/sec) elapsed = 0:0:00:13 / Mon Jun 14 15:45:40 2021
GPGPU-Sim PTX: 4000000 instructions simulated : ctaid=(4,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 4100000 instructions simulated : ctaid=(5,6,0) tid=(7,2,0)
GPGPU-Sim PTX: 4200000 instructions simulated : ctaid=(3,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 4300000 instructions simulated : ctaid=(5,2,0) tid=(5,7,0)
GPGPU-Sim uArch: cycles simulated: 13000  inst.: 4315724 (ipc=332.0) sim_rate=308266 (inst/sec) elapsed = 0:0:00:14 / Mon Jun 14 15:45:41 2021
GPGPU-Sim PTX: 4400000 instructions simulated : ctaid=(1,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 4500000 instructions simulated : ctaid=(4,3,0) tid=(7,6,0)
GPGPU-Sim PTX: 4600000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 14000  inst.: 4667380 (ipc=333.4) sim_rate=311158 (inst/sec) elapsed = 0:0:00:15 / Mon Jun 14 15:45:42 2021
GPGPU-Sim PTX: 4700000 instructions simulated : ctaid=(12,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 4800000 instructions simulated : ctaid=(12,7,0) tid=(7,6,0)
GPGPU-Sim PTX: 4900000 instructions simulated : ctaid=(14,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 5000000 instructions simulated : ctaid=(14,2,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 15000  inst.: 5017508 (ipc=334.5) sim_rate=313594 (inst/sec) elapsed = 0:0:00:16 / Mon Jun 14 15:45:43 2021
GPGPU-Sim PTX: 5100000 instructions simulated : ctaid=(7,3,0) tid=(3,8,0)
GPGPU-Sim PTX: 5200000 instructions simulated : ctaid=(14,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 5300000 instructions simulated : ctaid=(9,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 5400000 instructions simulated : ctaid=(3,5,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 16000  inst.: 5386128 (ipc=336.6) sim_rate=316831 (inst/sec) elapsed = 0:0:00:17 / Mon Jun 14 15:45:44 2021
GPGPU-Sim PTX: 5500000 instructions simulated : ctaid=(4,3,0) tid=(5,3,0)
GPGPU-Sim PTX: 5600000 instructions simulated : ctaid=(5,1,0) tid=(9,9,0)
GPGPU-Sim PTX: 5700000 instructions simulated : ctaid=(8,0,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 17000  inst.: 5743808 (ipc=337.9) sim_rate=319100 (inst/sec) elapsed = 0:0:00:18 / Mon Jun 14 15:45:45 2021
GPGPU-Sim PTX: 5800000 instructions simulated : ctaid=(3,1,0) tid=(1,3,0)
GPGPU-Sim PTX: 5900000 instructions simulated : ctaid=(0,7,0) tid=(9,1,0)
GPGPU-Sim PTX: 6000000 instructions simulated : ctaid=(3,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 6100000 instructions simulated : ctaid=(2,3,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 18000  inst.: 6097268 (ipc=338.7) sim_rate=320908 (inst/sec) elapsed = 0:0:00:19 / Mon Jun 14 15:45:46 2021
GPGPU-Sim PTX: 6200000 instructions simulated : ctaid=(2,0,0) tid=(9,3,0)
GPGPU-Sim PTX: 6300000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 6400000 instructions simulated : ctaid=(10,0,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 19000  inst.: 6468436 (ipc=340.4) sim_rate=323421 (inst/sec) elapsed = 0:0:00:20 / Mon Jun 14 15:45:47 2021
GPGPU-Sim PTX: 6500000 instructions simulated : ctaid=(13,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 6600000 instructions simulated : ctaid=(8,0,0) tid=(7,8,0)
GPGPU-Sim PTX: 6700000 instructions simulated : ctaid=(8,6,0) tid=(5,1,0)
GPGPU-Sim PTX: 6800000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 20000  inst.: 6825636 (ipc=341.3) sim_rate=325030 (inst/sec) elapsed = 0:0:00:21 / Mon Jun 14 15:45:48 2021
GPGPU-Sim PTX: 6900000 instructions simulated : ctaid=(5,6,0) tid=(9,9,0)
GPGPU-Sim PTX: 7000000 instructions simulated : ctaid=(4,5,0) tid=(5,9,0)
GPGPU-Sim PTX: 7100000 instructions simulated : ctaid=(11,0,0) tid=(3,2,0)
GPGPU-Sim PTX: 7200000 instructions simulated : ctaid=(12,4,0) tid=(9,9,0)
GPGPU-Sim uArch: cycles simulated: 21000  inst.: 7175100 (ipc=341.7) sim_rate=326140 (inst/sec) elapsed = 0:0:00:22 / Mon Jun 14 15:45:49 2021
GPGPU-Sim PTX: 7300000 instructions simulated : ctaid=(12,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 7400000 instructions simulated : ctaid=(1,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 7500000 instructions simulated : ctaid=(11,6,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 22000  inst.: 7529316 (ipc=342.2) sim_rate=327361 (inst/sec) elapsed = 0:0:00:23 / Mon Jun 14 15:45:50 2021
GPGPU-Sim PTX: 7600000 instructions simulated : ctaid=(4,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 7700000 instructions simulated : ctaid=(11,7,0) tid=(5,9,0)
GPGPU-Sim PTX: 7800000 instructions simulated : ctaid=(11,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 7900000 instructions simulated : ctaid=(7,5,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 23000  inst.: 7901844 (ipc=343.6) sim_rate=329243 (inst/sec) elapsed = 0:0:00:24 / Mon Jun 14 15:45:51 2021
GPGPU-Sim PTX: 8000000 instructions simulated : ctaid=(8,7,0) tid=(1,5,0)
GPGPU-Sim PTX: 8100000 instructions simulated : ctaid=(4,0,0) tid=(3,6,0)
GPGPU-Sim uArch: cycles simulated: 23500  inst.: 8076140 (ipc=343.7) sim_rate=323045 (inst/sec) elapsed = 0:0:00:25 / Mon Jun 14 15:45:52 2021
GPGPU-Sim PTX: 8200000 instructions simulated : ctaid=(6,3,0) tid=(7,0,0)
GPGPU-Sim PTX: 8300000 instructions simulated : ctaid=(12,7,0) tid=(3,6,0)
GPGPU-Sim PTX: 8400000 instructions simulated : ctaid=(5,5,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 24500  inst.: 8433512 (ipc=344.2) sim_rate=324365 (inst/sec) elapsed = 0:0:00:26 / Mon Jun 14 15:45:53 2021
GPGPU-Sim PTX: 8500000 instructions simulated : ctaid=(1,2,0) tid=(1,3,0)
GPGPU-Sim PTX: 8600000 instructions simulated : ctaid=(13,7,0) tid=(3,8,0)
GPGPU-Sim PTX: 8700000 instructions simulated : ctaid=(3,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 8800000 instructions simulated : ctaid=(11,4,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 25500  inst.: 8795428 (ipc=344.9) sim_rate=325756 (inst/sec) elapsed = 0:0:00:27 / Mon Jun 14 15:45:54 2021
GPGPU-Sim PTX: 8900000 instructions simulated : ctaid=(10,7,0) tid=(1,5,0)
GPGPU-Sim PTX: 9000000 instructions simulated : ctaid=(9,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 9100000 instructions simulated : ctaid=(10,1,0) tid=(9,3,0)
GPGPU-Sim uArch: cycles simulated: 26500  inst.: 9158844 (ipc=345.6) sim_rate=327101 (inst/sec) elapsed = 0:0:00:28 / Mon Jun 14 15:45:55 2021
GPGPU-Sim PTX: 9200000 instructions simulated : ctaid=(4,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 9300000 instructions simulated : ctaid=(8,4,0) tid=(1,9,0)
GPGPU-Sim PTX: 9400000 instructions simulated : ctaid=(6,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 9500000 instructions simulated : ctaid=(8,1,0) tid=(5,7,0)
GPGPU-Sim uArch: cycles simulated: 27500  inst.: 9507160 (ipc=345.7) sim_rate=327833 (inst/sec) elapsed = 0:0:00:29 / Mon Jun 14 15:45:56 2021
GPGPU-Sim PTX: 9600000 instructions simulated : ctaid=(5,1,0) tid=(9,7,0)
GPGPU-Sim PTX: 9700000 instructions simulated : ctaid=(7,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 9800000 instructions simulated : ctaid=(1,2,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 28500  inst.: 9864520 (ipc=346.1) sim_rate=328817 (inst/sec) elapsed = 0:0:00:30 / Mon Jun 14 15:45:57 2021
GPGPU-Sim PTX: 9900000 instructions simulated : ctaid=(1,7,0) tid=(7,8,0)
GPGPU-Sim PTX: 10000000 instructions simulated : ctaid=(1,2,0) tid=(3,0,0)
GPGPU-Sim PTX: 10100000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 10200000 instructions simulated : ctaid=(3,6,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 29500  inst.: 10222816 (ipc=346.5) sim_rate=329768 (inst/sec) elapsed = 0:0:00:31 / Mon Jun 14 15:45:58 2021
GPGPU-Sim PTX: 10300000 instructions simulated : ctaid=(2,1,0) tid=(3,4,0)
GPGPU-Sim PTX: 10400000 instructions simulated : ctaid=(11,2,0) tid=(1,5,0)
GPGPU-Sim PTX: 10500000 instructions simulated : ctaid=(6,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 30500  inst.: 10571584 (ipc=346.6) sim_rate=330362 (inst/sec) elapsed = 0:0:00:32 / Mon Jun 14 15:45:59 2021
GPGPU-Sim PTX: 10600000 instructions simulated : ctaid=(12,4,0) tid=(3,0,0)
GPGPU-Sim PTX: 10700000 instructions simulated : ctaid=(0,4,0) tid=(9,7,0)
GPGPU-Sim PTX: 10800000 instructions simulated : ctaid=(14,4,0) tid=(7,6,0)
GPGPU-Sim PTX: 10900000 instructions simulated : ctaid=(7,6,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 31500  inst.: 10941032 (ipc=347.3) sim_rate=331546 (inst/sec) elapsed = 0:0:00:33 / Mon Jun 14 15:46:00 2021
GPGPU-Sim PTX: 11000000 instructions simulated : ctaid=(6,3,0) tid=(5,5,0)
GPGPU-Sim PTX: 11100000 instructions simulated : ctaid=(13,2,0) tid=(7,4,0)
GPGPU-Sim PTX: 11200000 instructions simulated : ctaid=(6,4,0) tid=(7,0,0)
GPGPU-Sim PTX: 11300000 instructions simulated : ctaid=(3,6,0) tid=(9,9,0)
GPGPU-Sim uArch: cycles simulated: 32500  inst.: 11300316 (ipc=347.7) sim_rate=332362 (inst/sec) elapsed = 0:0:00:34 / Mon Jun 14 15:46:01 2021
GPGPU-Sim PTX: 11400000 instructions simulated : ctaid=(13,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 11500000 instructions simulated : ctaid=(4,5,0) tid=(5,5,0)
GPGPU-Sim PTX: 11600000 instructions simulated : ctaid=(12,2,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 33500  inst.: 11664956 (ipc=348.2) sim_rate=333284 (inst/sec) elapsed = 0:0:00:35 / Mon Jun 14 15:46:02 2021
GPGPU-Sim PTX: 11700000 instructions simulated : ctaid=(12,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 11800000 instructions simulated : ctaid=(7,1,0) tid=(7,0,0)
GPGPU-Sim PTX: 11900000 instructions simulated : ctaid=(2,2,0) tid=(5,5,0)
GPGPU-Sim PTX: 12000000 instructions simulated : ctaid=(8,6,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 34500  inst.: 12026356 (ipc=348.6) sim_rate=334065 (inst/sec) elapsed = 0:0:00:36 / Mon Jun 14 15:46:03 2021
GPGPU-Sim PTX: 12100000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 12200000 instructions simulated : ctaid=(11,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 12300000 instructions simulated : ctaid=(2,5,0) tid=(1,5,0)
GPGPU-Sim PTX: 12400000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 35500  inst.: 12395792 (ipc=349.2) sim_rate=335021 (inst/sec) elapsed = 0:0:00:37 / Mon Jun 14 15:46:04 2021
GPGPU-Sim PTX: 12500000 instructions simulated : ctaid=(2,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 12600000 instructions simulated : ctaid=(0,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 12700000 instructions simulated : ctaid=(6,3,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 36500  inst.: 12743812 (ipc=349.1) sim_rate=335363 (inst/sec) elapsed = 0:0:00:38 / Mon Jun 14 15:46:05 2021
GPGPU-Sim PTX: 12800000 instructions simulated : ctaid=(5,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 12900000 instructions simulated : ctaid=(12,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 13000000 instructions simulated : ctaid=(14,2,0) tid=(1,5,0)
GPGPU-Sim PTX: 13100000 instructions simulated : ctaid=(11,5,0) tid=(1,3,0)
GPGPU-Sim uArch: cycles simulated: 37500  inst.: 13091280 (ipc=349.1) sim_rate=335673 (inst/sec) elapsed = 0:0:00:39 / Mon Jun 14 15:46:06 2021
GPGPU-Sim PTX: 13200000 instructions simulated : ctaid=(8,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 13300000 instructions simulated : ctaid=(1,6,0) tid=(7,0,0)
GPGPU-Sim PTX: 13400000 instructions simulated : ctaid=(1,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 38500  inst.: 13449032 (ipc=349.3) sim_rate=336225 (inst/sec) elapsed = 0:0:00:40 / Mon Jun 14 15:46:07 2021
GPGPU-Sim PTX: 13500000 instructions simulated : ctaid=(3,0,0) tid=(1,3,0)
GPGPU-Sim PTX: 13600000 instructions simulated : ctaid=(7,5,0) tid=(3,4,0)
GPGPU-Sim PTX: 13700000 instructions simulated : ctaid=(3,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 13800000 instructions simulated : ctaid=(2,1,0) tid=(1,3,0)
GPGPU-Sim uArch: cycles simulated: 39500  inst.: 13803876 (ipc=349.5) sim_rate=336679 (inst/sec) elapsed = 0:0:00:41 / Mon Jun 14 15:46:08 2021
GPGPU-Sim PTX: 13900000 instructions simulated : ctaid=(1,2,0) tid=(3,6,0)
GPGPU-Sim PTX: 14000000 instructions simulated : ctaid=(1,4,0) tid=(9,1,0)
GPGPU-Sim PTX: 14100000 instructions simulated : ctaid=(0,2,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 40500  inst.: 14167076 (ipc=349.8) sim_rate=337311 (inst/sec) elapsed = 0:0:00:42 / Mon Jun 14 15:46:09 2021
GPGPU-Sim PTX: 14200000 instructions simulated : ctaid=(9,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 14300000 instructions simulated : ctaid=(4,6,0) tid=(3,8,0)
GPGPU-Sim PTX: 14400000 instructions simulated : ctaid=(4,5,0) tid=(9,7,0)
GPGPU-Sim PTX: 14500000 instructions simulated : ctaid=(8,6,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 41500  inst.: 14512928 (ipc=349.7) sim_rate=337509 (inst/sec) elapsed = 0:0:00:43 / Mon Jun 14 15:46:10 2021
GPGPU-Sim PTX: 14600000 instructions simulated : ctaid=(4,3,0) tid=(3,2,0)
GPGPU-Sim PTX: 14700000 instructions simulated : ctaid=(10,5,0) tid=(7,4,0)
GPGPU-Sim PTX: 14800000 instructions simulated : ctaid=(11,6,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 42500  inst.: 14871404 (ipc=349.9) sim_rate=337986 (inst/sec) elapsed = 0:0:00:44 / Mon Jun 14 15:46:11 2021
GPGPU-Sim PTX: 14900000 instructions simulated : ctaid=(7,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 15000000 instructions simulated : ctaid=(14,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 15100000 instructions simulated : ctaid=(9,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 15200000 instructions simulated : ctaid=(3,2,0) tid=(5,5,0)
GPGPU-Sim uArch: cycles simulated: 43500  inst.: 15233248 (ipc=350.2) sim_rate=338516 (inst/sec) elapsed = 0:0:00:45 / Mon Jun 14 15:46:12 2021
GPGPU-Sim PTX: 15300000 instructions simulated : ctaid=(14,1,0) tid=(5,5,0)
GPGPU-Sim PTX: 15400000 instructions simulated : ctaid=(0,2,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 44000  inst.: 15410888 (ipc=350.2) sim_rate=335019 (inst/sec) elapsed = 0:0:00:46 / Mon Jun 14 15:46:13 2021
GPGPU-Sim PTX: 15500000 instructions simulated : ctaid=(0,0,0) tid=(7,8,0)
GPGPU-Sim PTX: 15600000 instructions simulated : ctaid=(1,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 15700000 instructions simulated : ctaid=(6,3,0) tid=(3,6,0)
GPGPU-Sim uArch: cycles simulated: 45000  inst.: 15772716 (ipc=350.5) sim_rate=335589 (inst/sec) elapsed = 0:0:00:47 / Mon Jun 14 15:46:14 2021
GPGPU-Sim PTX: 15800000 instructions simulated : ctaid=(4,7,0) tid=(3,4,0)
GPGPU-Sim PTX: 15900000 instructions simulated : ctaid=(0,2,0) tid=(9,7,0)
GPGPU-Sim PTX: 16000000 instructions simulated : ctaid=(9,5,0) tid=(9,1,0)
GPGPU-Sim PTX: 16100000 instructions simulated : ctaid=(11,5,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 46000  inst.: 16147168 (ipc=351.0) sim_rate=336399 (inst/sec) elapsed = 0:0:00:48 / Mon Jun 14 15:46:15 2021
GPGPU-Sim PTX: 16200000 instructions simulated : ctaid=(6,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 16300000 instructions simulated : ctaid=(4,4,0) tid=(1,1,0)
GPGPU-Sim PTX: 16400000 instructions simulated : ctaid=(5,1,0) tid=(3,2,0)
GPGPU-Sim PTX: 16500000 instructions simulated : ctaid=(4,6,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 47000  inst.: 16499776 (ipc=351.1) sim_rate=336730 (inst/sec) elapsed = 0:0:00:49 / Mon Jun 14 15:46:16 2021
GPGPU-Sim PTX: 16600000 instructions simulated : ctaid=(10,2,0) tid=(5,1,0)
GPGPU-Sim uArch: Shader 16 finished CTA #0 (47265,0), 1 CTAs running
GPGPU-Sim uArch: Shader 29 finished CTA #0 (47414,0), 1 CTAs running
GPGPU-Sim uArch: Shader 5 finished CTA #0 (47423,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #0 (47436,0), 1 CTAs running
GPGPU-Sim uArch: Shader 51 finished CTA #0 (47497,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #0 (47499,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #0 (47501,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #0 (47559,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #1 (47584,0), 0 CTAs running
GPGPU-Sim uArch: Shader 14 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 39 finished CTA #0 (47615,0), 1 CTAs running
GPGPU-Sim uArch: Shader 16 finished CTA #1 (47620,0), 0 CTAs running
GPGPU-Sim uArch: Shader 16 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #0 (47632,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #0 (47633,0), 1 CTAs running
GPGPU-Sim uArch: Shader 7 finished CTA #0 (47636,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #1 (47654,0), 0 CTAs running
GPGPU-Sim uArch: Shader 17 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 10 finished CTA #0 (47661,0), 1 CTAs running
GPGPU-Sim uArch: Shader 34 finished CTA #1 (47663,0), 1 CTAs running
GPGPU-Sim uArch: Shader 53 finished CTA #0 (47671,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #0 (47690,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #1 (47725,0), 0 CTAs running
GPGPU-Sim uArch: Shader 18 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 27 finished CTA #0 (47732,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #0 (47734,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #0 (47754,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #0 (47758,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #0 (47760,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #1 (47767,0), 0 CTAs running
GPGPU-Sim uArch: Shader 12 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 5 finished CTA #1 (47768,0), 0 CTAs running
GPGPU-Sim uArch: Shader 5 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 32 finished CTA #0 (47776,0), 1 CTAs running
GPGPU-Sim uArch: Shader 27 finished CTA #1 (47790,0), 0 CTAs running
GPGPU-Sim uArch: Shader 27 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #0 (47794,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #0 (47796,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #1 (47798,0), 0 CTAs running
GPGPU-Sim uArch: Shader 8 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #0 (47811,0), 1 CTAs running
GPGPU-Sim uArch: Shader 25 finished CTA #0 (47812,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #1 (47813,0), 0 CTAs running
GPGPU-Sim uArch: Shader 3 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 13 finished CTA #1 (47818,0), 1 CTAs running
GPGPU-Sim uArch: Shader 13 finished CTA #0 (47838,0), 0 CTAs running
GPGPU-Sim uArch: Shader 13 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 44 finished CTA #0 (47844,0), 1 CTAs running
GPGPU-Sim uArch: Shader 32 finished CTA #1 (47848,0), 0 CTAs running
GPGPU-Sim uArch: Shader 32 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 50 finished CTA #1 (47862,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #1 (47863,0), 0 CTAs running
GPGPU-Sim uArch: Shader 23 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 53 finished CTA #1 (47865,0), 0 CTAs running
GPGPU-Sim uArch: Shader 53 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 7 finished CTA #1 (47867,0), 0 CTAs running
GPGPU-Sim uArch: Shader 7 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #1 (47875,0), 1 CTAs running
GPGPU-Sim uArch: Shader 52 finished CTA #1 (47884,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 19 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 40 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 6 finished CTA #0 (47888,0), 1 CTAs running
GPGPU-Sim uArch: Shader 57 finished CTA #0 (47892,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #1 (47904,0), 0 CTAs running
GPGPU-Sim uArch: Shader 55 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 52 finished CTA #0 (47906,0), 0 CTAs running
GPGPU-Sim uArch: Shader 52 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #0 (47909,0), 0 CTAs running
GPGPU-Sim uArch: Shader 20 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 0 finished CTA #0 (47914,0), 1 CTAs running
GPGPU-Sim uArch: Shader 10 finished CTA #1 (47915,0), 0 CTAs running
GPGPU-Sim uArch: Shader 10 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 29 finished CTA #1 (47919,0), 0 CTAs running
GPGPU-Sim uArch: Shader 29 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 11 finished CTA #0 (47938,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #1 (47963,0), 0 CTAs running
GPGPU-Sim uArch: Shader 45 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #1 (47964,0), 1 CTAs running
GPGPU-Sim uArch: Shader 11 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 11 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 57 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 57 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #0 (47967,0), 1 CTAs running
GPGPU-Sim uArch: Shader 43 finished CTA #0 (47975,0), 1 CTAs running
GPGPU-Sim uArch: Shader 47 finished CTA #0 (47978,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #0 (47985,0), 1 CTAs running
GPGPU-Sim uArch: Shader 36 finished CTA #0 (47987,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #1 (47988,0), 0 CTAs running
GPGPU-Sim uArch: Shader 4 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #1 (47991,0), 0 CTAs running
GPGPU-Sim uArch: Shader 9 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #0 (47996,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #1 (48000,0), 0 CTAs running
GPGPU-Sim uArch: Shader 21 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 51 finished CTA #1 (48001,0), 0 CTAs running
GPGPU-Sim uArch: Shader 51 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #0 (48008,0), 0 CTAs running
GPGPU-Sim uArch: Shader 15 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #0 (48010,0), 1 CTAs running
GPGPU-Sim uArch: Shader 30 finished CTA #0 (48018,0), 1 CTAs running
GPGPU-Sim uArch: Shader 0 finished CTA #1 (48019,0), 0 CTAs running
GPGPU-Sim uArch: Shader 0 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 54 finished CTA #0 (48026,0), 1 CTAs running
GPGPU-Sim uArch: Shader 6 finished CTA #1 (48028,0), 0 CTAs running
GPGPU-Sim uArch: Shader 6 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #0 (48031,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #0 (48042,0), 1 CTAs running
GPGPU-Sim uArch: Shader 54 finished CTA #1 (48056,0), 0 CTAs running
GPGPU-Sim uArch: Shader 54 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #1 (48059,0), 0 CTAs running
GPGPU-Sim uArch: Shader 24 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #1 (48061,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #1 (48063,0), 0 CTAs running
GPGPU-Sim uArch: Shader 48 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 30 finished CTA #1 (48070,0), 0 CTAs running
GPGPU-Sim uArch: Shader 30 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 25 finished CTA #1 (48079,0), 0 CTAs running
GPGPU-Sim uArch: Shader 25 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #1 (48081,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #1 (48083,0), 0 CTAs running
GPGPU-Sim uArch: Shader 59 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #0 (48084,0), 1 CTAs running
GPGPU-Sim uArch: Shader 49 finished CTA #1 (48086,0), 1 CTAs running
GPGPU-Sim uArch: Shader 39 finished CTA #1 (48096,0), 0 CTAs running
GPGPU-Sim uArch: Shader 39 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #0 (48098,0), 0 CTAs running
GPGPU-Sim uArch: Shader 2 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 49 finished CTA #0 (48102,0), 0 CTAs running
GPGPU-Sim uArch: Shader 49 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 28 finished CTA #0 (48104,0), 1 CTAs running
GPGPU-Sim uArch: Shader 28 finished CTA #1 (48107,0), 0 CTAs running
GPGPU-Sim uArch: Shader 28 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #0 (48111,0), 1 CTAs running
GPGPU-Sim uArch: Shader 50 finished CTA #0 (48111,0), 0 CTAs running
GPGPU-Sim uArch: Shader 50 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #1 (48114,0), 0 CTAs running
GPGPU-Sim uArch: Shader 22 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #1 (48132,0), 0 CTAs running
GPGPU-Sim uArch: Shader 58 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 47 finished CTA #1 (48143,0), 0 CTAs running
GPGPU-Sim uArch: Shader 47 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #0 (48153,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #1 (48153,0), 0 CTAs running
GPGPU-Sim uArch: Shader 35 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 36 finished CTA #1 (48154,0), 0 CTAs running
GPGPU-Sim uArch: Shader 36 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #1 (48160,0), 0 CTAs running
GPGPU-Sim uArch: Shader 31 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 26 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 33 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #0 (48165,0), 0 CTAs running
GPGPU-Sim uArch: Shader 38 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 34 finished CTA #0 (48177,0), 0 CTAs running
GPGPU-Sim uArch: Shader 34 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #1 (48188,0), 0 CTAs running
GPGPU-Sim uArch: Shader 56 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #1 (48189,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #0 (48192,0), 1 CTAs running
GPGPU-Sim uArch: Shader 37 finished CTA #1 (48202,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #1 (48212,0), 0 CTAs running
GPGPU-Sim uArch: Shader 1 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #0 (48216,0), 0 CTAs running
GPGPU-Sim uArch: Shader 41 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 37 finished CTA #0 (48218,0), 0 CTAs running
GPGPU-Sim uArch: Shader 37 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #1 (48221,0), 0 CTAs running
GPGPU-Sim uArch: Shader 46 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #0 (48226,0), 1 CTAs running
GPGPU-Sim uArch: Shader 44 finished CTA #1 (48233,0), 0 CTAs running
GPGPU-Sim uArch: Shader 44 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #1 (48248,0), 0 CTAs running
GPGPU-Sim uArch: Shader 42 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 43 finished CTA #1 (48281,0), 0 CTAs running
GPGPU-Sim uArch: Shader 43 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: GPU detected kernel '_Z14matrix_mul_gpuPiS_S_i' finished on shader 43.
kernel_name = _Z14matrix_mul_gpuPiS_S_i 
kernel_launch_uid = 1 
gpu_sim_cycle = 48282
gpu_sim_insn = 16632000
gpu_ipc =     344.4762
gpu_tot_sim_cycle = 48282
gpu_tot_sim_insn = 16632000
gpu_tot_ipc =     344.4762
gpu_tot_issued_cta = 0
gpu_stall_dramfull = 42547
gpu_stall_icnt2sh    = 68778
gpu_total_sim_rate=332640

========= Core cache stats =========
L1I_cache:
	L1I_total_cache_accesses = 371520
	L1I_total_cache_misses = 1920
	L1I_total_cache_miss_rate = 0.0052
	L1I_total_cache_pending_hits = 0
	L1I_total_cache_reservation_fails = 0
L1D_cache:
	L1D_cache_core[0]: Access = 21513, Miss = 1284, Miss_rate = 0.060, Pending_hits = 5147, Reservation_fails = 2614
	L1D_cache_core[1]: Access = 21489, Miss = 1278, Miss_rate = 0.059, Pending_hits = 5140, Reservation_fails = 1796
	L1D_cache_core[2]: Access = 21492, Miss = 1281, Miss_rate = 0.060, Pending_hits = 5151, Reservation_fails = 2084
	L1D_cache_core[3]: Access = 21457, Miss = 1268, Miss_rate = 0.059, Pending_hits = 5125, Reservation_fails = 1525
	L1D_cache_core[4]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5144, Reservation_fails = 928
	L1D_cache_core[5]: Access = 21481, Miss = 1272, Miss_rate = 0.059, Pending_hits = 5142, Reservation_fails = 2045
	L1D_cache_core[6]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5143, Reservation_fails = 2898
	L1D_cache_core[7]: Access = 21505, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5158, Reservation_fails = 3324
	L1D_cache_core[8]: Access = 21508, Miss = 1279, Miss_rate = 0.059, Pending_hits = 5149, Reservation_fails = 2750
	L1D_cache_core[9]: Access = 21505, Miss = 1287, Miss_rate = 0.060, Pending_hits = 5157, Reservation_fails = 3313
	L1D_cache_core[10]: Access = 21508, Miss = 1293, Miss_rate = 0.060, Pending_hits = 5164, Reservation_fails = 3276
	L1D_cache_core[11]: Access = 21505, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5148, Reservation_fails = 2931
	L1D_cache_core[12]: Access = 21508, Miss = 1292, Miss_rate = 0.060, Pending_hits = 5150, Reservation_fails = 2987
	L1D_cache_core[13]: Access = 21513, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5145, Reservation_fails = 2983
	L1D_cache_core[14]: Access = 21516, Miss = 1285, Miss_rate = 0.060, Pending_hits = 5136, Reservation_fails = 3154
	L1D_total_cache_accesses = 322468
	L1D_total_cache_misses = 19228
	L1D_total_cache_miss_rate = 0.0596
	L1D_total_cache_pending_hits = 77199
	L1D_total_cache_reservation_fails = 38608
	L1D_cache_data_port_util = 0.078
	L1D_cache_fill_port_util = 0.006
L1C_cache:
	L1C_total_cache_accesses = 1920
	L1C_total_cache_misses = 480
	L1C_total_cache_miss_rate = 0.2500
	L1C_total_cache_pending_hits = 0
	L1C_total_cache_reservation_fails = 0
L1T_cache:
	L1T_total_cache_accesses = 0
	L1T_total_cache_misses = 0
	L1T_total_cache_pending_hits = 0
	L1T_total_cache_reservation_fails = 0

Total_core_cache_stats:
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 225461
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 77199
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 17972
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 13408
	Total_core_cache_stats_breakdown[CONST_ACC_R][HIT] = 1440
	Total_core_cache_stats_breakdown[CONST_ACC_R][MISS] = 480
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 580
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 1256
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][RESERVATION_FAIL] = 25200
	Total_core_cache_stats_breakdown[INST_ACC_R][HIT] = 369600
	Total_core_cache_stats_breakdown[INST_ACC_R][MISS] = 1920
Shader 0 warp_id issue ditsribution:
warp_id:
0, 1, 2, 3, 4, 5, 6, 7, 
distro:
1388, 1388, 1388, 1388, 1388, 1388, 1388, 1388, 
gpgpu_n_tot_thrd_icount = 21319680
gpgpu_n_tot_w_icount = 666240
gpgpu_n_stall_shd_mem = 216596
gpgpu_n_mem_read_local = 0
gpgpu_n_mem_write_local = 0
gpgpu_n_mem_read_global = 17972
gpgpu_n_mem_write_global = 1836
gpgpu_n_mem_texture = 0
gpgpu_n_mem_const = 60
gpgpu_n_load_insn  = 3600000
gpgpu_n_store_insn = 12000
gpgpu_n_shmem_insn = 0
gpgpu_n_tex_insn = 0
gpgpu_n_const_mem_insn = 0
gpgpu_n_param_mem_insn = 48000
gpgpu_n_shmem_bkconflict = 0
gpgpu_n_cache_bkconflict = 0
gpgpu_n_intrawarp_mshr_merge = 0
gpgpu_n_cmem_portconflict = 0
gpgpu_stall_shd_mem[c_mem][bk_conf] = 0
gpgpu_stall_shd_mem[c_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[c_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[c_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[t_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[t_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[t_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[s_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][coal_stall] = 216596
gpgpu_stall_shd_mem[gl_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[g_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[g_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpu_reg_bank_conflict_stalls = 0
Warp Occupancy Distribution:
Stall:169500	W0_Idle:293067	W0_Scoreboard:4645929	W1:0	W2:0	W3:0	W4:166560	W5:0	W6:0	W7:0	W8:0	W9:0	W10:0	W11:0	W12:0	W13:0	W14:0	W15:0	W16:0	W17:0	W18:0	W19:0	W20:0	W21:0	W22:0	W23:0	W24:0	W25:0	W26:0	W27:0	W28:0	W29:0	W30:0	W31:0	W32:499680
traffic_breakdown_coretomem[CONST_ACC_R] = 480 {8:60,}
traffic_breakdown_coretomem[GLOBAL_ACC_R] = 143776 {8:17972,}
traffic_breakdown_coretomem[GLOBAL_ACC_W] = 117600 {40:1008,72:552,136:276,}
traffic_breakdown_coretomem[INST_ACC_R] = 1920 {8:240,}
traffic_breakdown_memtocore[CONST_ACC_R] = 4320 {72:60,}
traffic_breakdown_memtocore[GLOBAL_ACC_R] = 2444192 {136:17972,}
traffic_breakdown_memtocore[GLOBAL_ACC_W] = 14688 {8:1836,}
traffic_breakdown_memtocore[INST_ACC_R] = 32640 {136:240,}
maxmrqlatency = 12 
maxdqlatency = 0 
maxmflatency = 1356 
averagemflatency = 264 
max_icnt2mem_latency = 1205 
max_icnt2sh_latency = 48281 
mrq_lat_table:1080 	32 	4 	10 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
dq_lat_table:0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_table:0 	0 	0 	0 	0 	0 	0 	10720 	8178 	899 	71 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2mem_lat_table:0 	0 	0 	15440 	432 	764 	1403 	1061 	742 	253 	13 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2sh_lat_table:0 	0 	0 	3446 	13655 	892 	39 	0 	0 	0 	0 	0 	0 	0 	0 	1836 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_pw_table:0 	0 	0 	0 	0 	0 	0 	87 	6 	2 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
maximum concurrent accesses to same row:
dram[0]:         1         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
maximum service time to same row:
dram[0]:       502         0         0         0      1650      1707      1653      1501     12085     12831     36822     37578         0         0         0         0 
dram[1]:      1485         0         0         0      1763      1494      1765      1521     12204     13041     36903     37765         0         0         0         0 
dram[2]:         0         0         0         0      1503      1510      1526      1516     12325     13141     37102     37849         0         0         0         0 
dram[3]:         0         0         0         0      1499      1588      1525      1678     12515     13263     37144     38024         0         0         0         0 
dram[4]:         0         0         0         0      1616      1493      1484      1515     12585     13452     37341     38075         0         0         0         0 
dram[5]:         0         0         0         0      1497      1606      1519      1688     12779     13531     37456     38221         0         0         0         0 
average row accesses per activate:
dram[0]:  1.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 21.000000 19.000000      -nan      -nan      -nan      -nan 
dram[1]:  2.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 19.000000 16.000000      -nan      -nan      -nan      -nan 
dram[2]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 18.000000      -nan      -nan      -nan      -nan 
dram[3]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 17.000000      -nan      -nan      -nan      -nan 
dram[4]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 18.000000 21.000000      -nan      -nan      -nan      -nan 
dram[5]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 22.000000 16.000000      -nan      -nan      -nan      -nan 
average row locality = 1126/52 = 21.653847
number of total memory accesses made:
dram[0]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
total accesses: 0
min_bank_accesses = 0!
min_chip_accesses = 0!
number of total read accesses:
dram[0]:         3         0         0         0        10        10        32        32        32        32        16        16         0         0         0         0 
dram[1]:         2         0         0         0        10        10        32        32        32        32        16        15         0         0         0         0 
dram[2]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[3]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[4]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[5]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
total reads: 1084
min_bank_accesses = 0!
chip skew: 183/180 = 1.02
number of total write accesses:
dram[0]:         0         0         0         0         0         0         0         0         0         0         5         3         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         3         1         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         3         4         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         3         3         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         2         7         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         6         2         0         0         0         0 
total reads: 42
min_bank_accesses = 0!
chip skew: 9/4 = 2.25
average mf latency per bank:
dram[0]:       8505    none      none      none        6219      4879      4705      5583      4863      4800      4214      4411    none      none      none      none  
dram[1]:          0    none      none      none        4030      5132      5447      4339      4812      4857      4258      5293    none      none      none      none  
dram[2]:     none      none      none      none        4973      3789      4482      4576      4719      4883      4095      3891    none      none      none      none  
dram[3]:     none      none      none      none        3574      4782      4611      4852      4883      5030      4219      3979    none      none      none      none  
dram[4]:     none      none      none      none        6249      4014      5355      4358      4534      4853      4952      3543    none      none      none      none  
dram[5]:     none      none      none      none        4108      5204      4843      5156      4601      4623      3826      4504    none      none      none      none  
maximum mf latency per bank:
dram[0]:        486         0         0         0      1110      1069      1107      1325       592       781       876       721         0         0         0         0
dram[1]:          0         0         0         0      1097       809      1356       876       801       646       697       706         0         0         0         0
dram[2]:          0         0         0         0       882       679       907      1071       716       771       873       759         0         0         0         0
dram[3]:          0         0         0         0       377       983      1018      1068       742       781       690       817         0         0         0         0
dram[4]:          0         0         0         0      1081       527      1180       723       799       631       718       700         0         0         0         0
dram[5]:          0         0         0         0       543      1153      1006      1168       728       775       873       742         0         0         0         0

Number of Memory Banks Accessed per Memory Operation per Warp (from 0):
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
Average # of Memory Banks Accessed per Memory Operation per Warp=-nan

position of mrq chosen
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	

average position of mrq chosen = -nan
Memory Partition 0: 
Cache L2_bank_000:
MSHR contents

Cache L2_bank_001:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[0]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63344 n_act=11 n_pre=2 n_req=191 n_rd=366 n_write=8 bw_util=0.01174
n_activity=2903 dram_eff=0.2577
bk0: 6a 63659i bk1: 0a 63730i bk2: 0a 63734i bk3: 0a 63734i bk4: 20a 63671i bk5: 20a 63680i bk6: 64a 63581i bk7: 64a 63588i bk8: 64a 63586i bk9: 64a 63580i bk10: 32a 63625i bk11: 32a 63618i bk12: 0a 63727i bk13: 0a 63727i bk14: 0a 63730i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000784548
Memory Partition 1: 
Cache L2_bank_002:
MSHR contents

Cache L2_bank_003:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[1]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=9 n_pre=0 n_req=185 n_rd=362 n_write=5 bw_util=0.01152
n_activity=2703 dram_eff=0.2716
bk0: 4a 63711i bk1: 0a 63732i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 20a 63676i bk6: 64a 63580i bk7: 64a 63580i bk8: 64a 63587i bk9: 64a 63581i bk10: 32a 63624i bk11: 30a 63641i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63732i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000266746
Memory Partition 2: 
Cache L2_bank_004:
MSHR contents

Cache L2_bank_005:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[2]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=187 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2765 dram_eff=0.2662
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63679i bk5: 24a 63671i bk6: 64a 63581i bk7: 64a 63590i bk8: 64a 63588i bk9: 64a 63587i bk10: 32a 63623i bk11: 28a 63638i bk12: 0a 63728i bk13: 0a 63730i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000235364
Memory Partition 3: 
Cache L2_bank_006:
MSHR contents

Cache L2_bank_007:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[3]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63357 n_act=8 n_pre=0 n_req=186 n_rd=360 n_write=6 bw_util=0.01149
n_activity=2830 dram_eff=0.2587
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63680i bk5: 24a 63670i bk6: 64a 63580i bk7: 64a 63572i bk8: 64a 63586i bk9: 64a 63589i bk10: 32a 63621i bk11: 28a 63638i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000580565
Memory Partition 4: 
Cache L2_bank_008:
MSHR contents

Cache L2_bank_009:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[4]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63353 n_act=8 n_pre=0 n_req=189 n_rd=360 n_write=10 bw_util=0.01161
n_activity=2771 dram_eff=0.2671
bk0: 0a 63731i bk1: 0a 63733i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 24a 63670i bk6: 64a 63587i bk7: 64a 63587i bk8: 64a 63588i bk9: 64a 63581i bk10: 32a 63633i bk11: 28a 63627i bk12: 0a 63728i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000313819
Memory Partition 5: 
Cache L2_bank_010:
MSHR contents

Cache L2_bank_011:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[5]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=188 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2789 dram_eff=0.2639
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63733i bk4: 20a 63678i bk5: 24a 63668i bk6: 64a 63584i bk7: 64a 63582i bk8: 64a 63589i bk9: 64a 63588i bk10: 32a 63598i bk11: 28a 63623i bk12: 0a 63729i bk13: 0a 63730i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.00105129

========= L2 cache stats =========
L2_cache_bank[0]: Access = 1826, Miss = 93, Miss_rate = 0.051, Pending_hits = 248, Reservation_fails = 4441
L2_cache_bank[1]: Access = 1684, Miss = 90, Miss_rate = 0.053, Pending_hits = 231, Reservation_fails = 3387
L2_cache_bank[2]: Access = 1777, Miss = 92, Miss_rate = 0.052, Pending_hits = 239, Reservation_fails = 3478
L2_cache_bank[3]: Access = 1652, Miss = 89, Miss_rate = 0.054, Pending_hits = 227, Reservation_fails = 3558
L2_cache_bank[4]: Access = 1642, Miss = 90, Miss_rate = 0.055, Pending_hits = 236, Reservation_fails = 3430
L2_cache_bank[5]: Access = 1661, Miss = 90, Miss_rate = 0.054, Pending_hits = 232, Reservation_fails = 3472
L2_cache_bank[6]: Access = 1637, Miss = 90, Miss_rate = 0.055, Pending_hits = 237, Reservation_fails = 3884
L2_cache_bank[7]: Access = 1639, Miss = 90, Miss_rate = 0.055, Pending_hits = 250, Reservation_fails = 4069
L2_cache_bank[8]: Access = 1656, Miss = 90, Miss_rate = 0.054, Pending_hits = 250, Reservation_fails = 3783
L2_cache_bank[9]: Access = 1643, Miss = 90, Miss_rate = 0.055, Pending_hits = 241, Reservation_fails = 3742
L2_cache_bank[10]: Access = 1641, Miss = 90, Miss_rate = 0.055, Pending_hits = 239, Reservation_fails = 3801
L2_cache_bank[11]: Access = 1650, Miss = 90, Miss_rate = 0.055, Pending_hits = 243, Reservation_fails = 3849
L2_total_cache_accesses = 20108
L2_total_cache_misses = 1084
L2_total_cache_miss_rate = 0.0539
L2_total_cache_pending_hits = 2873
L2_total_cache_reservation_fails = 44894
L2_total_cache_breakdown:
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 14077
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 2839
	L2_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 1056
	L2_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 44379
	L2_cache_stats_breakdown[CONST_ACC_R][HIT] = 56
	L2_cache_stats_breakdown[CONST_ACC_R][HIT_RESERVED] = 3
	L2_cache_stats_breakdown[CONST_ACC_R][MISS] = 1
	L2_cache_stats_breakdown[CONST_ACC_R][RESERVATION_FAIL] = 129
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 1794
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT_RESERVED] = 19
	L2_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 23
	L2_cache_stats_breakdown[INST_ACC_R][HIT] = 224
	L2_cache_stats_breakdown[INST_ACC_R][HIT_RESERVED] = 12
	L2_cache_stats_breakdown[INST_ACC_R][MISS] = 4
	L2_cache_stats_breakdown[INST_ACC_R][RESERVATION_FAIL] = 386
L2_cache_data_port_util = 0.104
L2_cache_fill_port_util = 0.007

icnt_total_pkts_mem_to_simt=93076
icnt_total_pkts_simt_to_mem=23324
LD_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ST_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
----------------------------Interconnect-DETAILS--------------------------------
Class 0:
Packet latency average = 18.5053
	minimum = 6
	maximum = 729
Network latency average = 13.7655
	minimum = 6
	maximum = 426
Slowest packet = 623
Flit latency average = 11.0758
	minimum = 6
	maximum = 426
Slowest flit = 1683
Fragmentation average = 0
	minimum = 0
	maximum = 0
Injected packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Accepted packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Injected flit rate average = 0.0892902
	minimum = 0.0318131 (at node 3)
	maximum = 0.173936 (at node 15)
Accepted flit rate average= 0.0892902
	minimum = 0.0392072 (at node 24)
	maximum = 0.129406 (at node 9)
Injected packet length average = 2.89437
Accepted packet length average = 2.89437
Total in-flight flits = 0 (0 measured)
====== Overall Traffic Statistics ======
====== Traffic class 0 ======
Packet latency average = 18.5053 (1 samples)
	minimum = 6 (1 samples)
	maximum = 729 (1 samples)
Network latency average = 13.7655 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Flit latency average = 11.0758 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Fragmentation average = 0 (1 samples)
	minimum = 0 (1 samples)
	maximum = 0 (1 samples)
Injected packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Accepted packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Injected flit rate average = 0.0892902 (1 samples)
	minimum = 0.0318131 (1 samples)
	maximum = 0.173936 (1 samples)
Accepted flit rate average = 0.0892902 (1 samples)
	minimum = 0.0392072 (1 samples)
	maximum = 0.129406 (1 samples)
Injected packet size average = 2.89437 (1 samples)
Accepted packet size average = 2.89437 (1 samples)
Hops average = 1 (1 samples)
----------------------------END-of-Interconnect-DETAILS-------------------------


gpgpu_simulation_time = 0 days, 0 hrs, 0 min, 50 sec (50 sec)
gpgpu_simulation_rate = 332640 (inst/sec)
gpgpu_simulation_rate = 965 (cycle/sec)
total time is 49281 ms


        *** GPGPU-Sim Simulator Version 3.2.2  [build 0] ***


GPGPU-Sim PTX: simulation mode 0 (can change with PTX_SIM_MODE_FUNC environment variable:
               1=functional simulation only, 0=detailed performance simulator)
GPGPU-Sim: Configuration options:

-network_mode                           1 # Interconnection network mode
-inter_config_file   config_fermi_islip.icnt # Interconnection network config file
-gpgpu_ptx_use_cuobjdump                    1 # Use cuobjdump to extract ptx and sass from binaries
-gpgpu_experimental_lib_support                    0 # Try to extract code from cuda libraries [Broken because of unknown cudaGetExportTable]
-gpgpu_ptx_convert_to_ptxplus                    0 # Convert SASS (native ISA) to ptxplus and run ptxplus
-gpgpu_ptx_force_max_capability                   20 # Force maximum compute capability
-gpgpu_ptx_inst_debug_to_file                    0 # Dump executed instructions' debug information to file
-gpgpu_ptx_inst_debug_file       inst_debug.txt # Executed instructions' debug output file
-gpgpu_ptx_inst_debug_thread_uid                    1 # Thread UID for executed instructions' debug output
-gpgpu_simd_model                       1 # 1 = post-dominator
-gpgpu_shader_core_pipeline              1536:32 # shader core pipeline config, i.e., {<nthread>:<warpsize>}
-gpgpu_tex_cache:l1  4:128:24,L:R:m:N:L,F:128:4,128:2 # per-shader L1 texture cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>:<rf>}
-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4 # per-shader L1 constant memory cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:il1     4:128:4,L:R:f:N:L,A:2:32,4 # shader L1 instruction cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:dl1     32:128:4,L:L:m:N:H,A:32:8,8 # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PrefL1                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PreShared                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gmem_skip_L1D                          0 # global memory access skip L1D cache (implements -Xptxas -dlcm=cg, default=no skip)
-gpgpu_perfect_mem                      0 # enable perfect memory mode (no cache miss)
-n_regfile_gating_group                    4 # group of lanes that should be read/written together)
-gpgpu_clock_gated_reg_file                    0 # enable clock gated reg file for power calculations
-gpgpu_clock_gated_lanes                    0 # enable clock gated lanes for power calculations
-gpgpu_shader_registers                32768 # Number of registers per shader core. Limits number of concurrent CTAs. (default 8192)
-gpgpu_shader_cta                       8 # Maximum number of concurrent CTAs in shader (default 8)
-gpgpu_num_cta_barriers                   16 # Maximum number of named barriers per CTA (default 16)
-gpgpu_n_clusters                      15 # number of processing clusters
-gpgpu_n_cores_per_cluster                    4 # number of simd cores per cluster
-gpgpu_n_cluster_ejection_buffer_size                    8 # number of packets in ejection buffer
-gpgpu_n_ldst_response_buffer_size                    2 # number of response packets in ld/st unit ejection buffer
-gpgpu_shmem_size                   16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size                   49152 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefL1                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefShared                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_num_banks                   32 # Number of banks in the shared memory in each shader core (default 16)
-gpgpu_shmem_limited_broadcast                    0 # Limit shared memory to do one broadcast per cycle (default on)
-gpgpu_shmem_warp_parts                    1 # Number of portions a warp is divided into for shared memory bank conflict check 
-gpgpu_warpdistro_shader                   -1 # Specify which shader core to collect the warp size distribution from
-gpgpu_warp_issue_shader                    0 # Specify which shader core to collect the warp issue distribution from
-gpgpu_local_mem_map                    1 # Mapping from local memory space address to simulated GPU physical address space (default = enabled)
-gpgpu_num_reg_banks                   16 # Number of register banks (default = 8)
-gpgpu_reg_bank_use_warp_id                    0 # Use warp ID in mapping registers to banks (default = off)
-gpgpu_operand_collector_num_units_sp                    6 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_sfu                    8 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_mem                    2 # number of collector units (default = 2)
-gpgpu_operand_collector_num_units_gen                    0 # number of collector units (default = 0)
-gpgpu_operand_collector_num_in_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_operand_collector_num_out_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_coalesce_arch                   13 # Coalescing arch (default = 13, anything else is off for now)
-gpgpu_num_sched_per_core                    2 # Number of warp schedulers per core
-gpgpu_max_insn_issue_per_warp                    1 # Max number of instructions that can be issued per warp in one cycle by scheduler
-gpgpu_simt_core_sim_order                    1 # Select the simulation order of cores in a cluster (0=Fix, 1=Round-Robin)
-gpgpu_pipeline_widths        2,1,1,2,1,1,2 # Pipeline widths ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
-gpgpu_num_sp_units                     2 # Number of SP units (default=1)
-gpgpu_num_sfu_units                    1 # Number of SF units (default=1)
-gpgpu_num_mem_units                    1 # Number if ldst units (default=1) WARNING: not hooked up to anything
-gpgpu_scheduler                      gto # Scheduler configuration: < lrr | gto | two_level_active > If two_level_active:<num_active_warps>:<inner_prioritization>:<outer_prioritization>For complete list of prioritization values see shader.h enum scheduler_prioritization_typeDefault: gto
-gpgpu_dram_scheduler                    1 # 0 = fifo, 1 = FR-FCFS (defaul)
-gpgpu_dram_partition_queues              8:8:8:8 # i2$:$2d:d2$:$2i
-l2_ideal                               0 # Use a ideal L2 cache that always hit
-gpgpu_cache:dl2     64:128:8,L:B:m:W:L,A:32:4,4:0,32 # unified banked L2 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>}
-gpgpu_cache:dl2_texture_only                    0 # L2 cache used for texture only
-gpgpu_n_mem                            6 # number of memory modules (e.g. memory controllers) in gpu
-gpgpu_n_sub_partition_per_mchannel                    2 # number of memory subpartition in each memory module
-gpgpu_n_mem_per_ctrlr                    2 # number of memory chips per memory controller
-gpgpu_memlatency_stat                   14 # track and display latency statistics 0x2 enables MC, 0x4 enables queue logs
-gpgpu_frfcfs_dram_sched_queue_size                   16 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_return_queue_size                  116 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_buswidth                    4 # default = 4 bytes (8 bytes per cycle at DDR)
-gpgpu_dram_burst_length                    8 # Burst length of each DRAM request (default = 4 data bus cycle)
-dram_data_command_freq_ratio                    4 # Frequency ratio between DRAM data bus and command bus (default = 2 times, i.e. DDR)
-gpgpu_dram_timing_opt nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2 # DRAM timing parameters = {nbk:tCCD:tRRD:tRCD:tRAS:tRP:tRC:CL:WL:tCDLR:tWR:nbkgrp:tCCDL:tRTPL}
-rop_latency                          120 # ROP queue latency (default 85)
-dram_latency                         100 # DRAM latency (default 30)
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS # mapping memory address to dram model {dramid@<start bit>;<memory address map>}
-gpgpu_mem_addr_test                    0 # run sweep test to check address mapping for aliased address
-gpgpu_mem_address_mask                    1 # 0 = old addressing mask, 1 = new addressing mask, 2 = new add. mask + flipped bank sel and chip sel bits
-gpuwattch_xml_file  gpuwattch_gtx480.xml # GPUWattch XML file
-power_simulation_enabled                    1 # Turn on power simulator (1=On, 0=Off)
-power_per_cycle_dump                    0 # Dump detailed power output each cycle
-power_trace_enabled                    0 # produce a file for the power trace (1=On, 0=Off)
-power_trace_zlevel                     6 # Compression level of the power trace output log (0=no comp, 9=highest)
-steady_power_levels_enabled                    0 # produce a file for the steady power levels (1=On, 0=Off)
-steady_state_definition                  8:4 # allowed deviation:number of samples
-gpgpu_max_cycle                        0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_insn                         0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_cta                          0 # terminates gpu simulation early (0 = no limit)
-gpgpu_runtime_stat                   500 # display runtime statistics such as dram utilization {<freq>:<flag>}
-liveness_message_freq                    1 # Minimum number of seconds between simulation liveness messages (0 = always print)
-gpgpu_flush_l1_cache                    0 # Flush L1 cache at the end of each kernel call
-gpgpu_flush_l2_cache                    0 # Flush L2 cache at the end of each kernel call
-gpgpu_deadlock_detect                    1 # Stop the simulation at deadlock (1=on (default), 0=off)
-gpgpu_ptx_instruction_classification                    0 # if enabled will classify ptx instruction types per kernel (Max 255 kernels now)
-gpgpu_ptx_sim_mode                     0 # Select between Performance (default) or Functional simulation (1)
-gpgpu_clock_domains 700.0:700.0:700.0:924.0 # Clock Domain Frequencies in MhZ {<Core Clock>:<ICNT Clock>:<L2 Clock>:<DRAM Clock>}
-gpgpu_max_concurrent_kernel                    8 # maximum kernels that can run concurrently on GPU
-gpgpu_cflog_interval                    0 # Interval between each snapshot in control flow logger
-visualizer_enabled                     0 # Turn on visualizer output (1=On, 0=Off)
-visualizer_outputfile                 NULL # Specifies the output log file for visualizer
-visualizer_zlevel                      6 # Compression level of the visualizer output log (0=no comp, 9=highest)
-trace_enabled                          0 # Turn on traces
-trace_components                    none # comma seperated list of traces to enable. Complete list found in trace_streams.tup. Default none
-trace_sampling_core                    0 # The core which is printed using CORE_DPRINTF. Default 0
-trace_sampling_memory_partition                   -1 # The memory partition which is printed using MEMPART_DPRINTF. Default -1 (i.e. all)
-enable_ptx_file_line_stats                    1 # Turn on PTX source line statistic profiling. (1 = On)
-ptx_line_stats_filename gpgpu_inst_stats.txt # Output file for PTX source line statistics.
-save_embedded_ptx                      0 # saves ptx files embedded in binary as <n>.ptx
-keep                                   0 # keep intermediate files created by GPGPU-Sim when interfacing with external programs
-gpgpu_ptx_save_converted_ptxplus                    0 # Saved converted ptxplus to a file
-ptx_opcode_latency_int         4,13,4,5,145 # Opcode latencies for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,19,25,145
-ptx_opcode_latency_fp          4,13,4,5,39 # Opcode latencies for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,30
-ptx_opcode_latency_dp         8,19,8,8,330 # Opcode latencies for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,335
-ptx_opcode_initiation_int            1,2,2,1,8 # Opcode initiation intervals for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,4,4,32
-ptx_opcode_initiation_fp            1,2,1,1,4 # Opcode initiation intervals for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,5
-ptx_opcode_initiation_dp         8,16,8,8,130 # Opcode initiation intervals for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,130
DRAM Timing Options:
nbk                                    16 # number of banks
CCD                                     2 # column to column delay
RRD                                     6 # minimal delay between activation of rows in different banks
RCD                                    12 # row to column delay
RAS                                    28 # time needed to activate row
RP                                     12 # time needed to precharge (deactivate) row
RC                                     40 # row cycle time
CDLR                                    5 # switching from write to read (changes tWTR)
WR                                     12 # last data-in to row precharge
CL                                     12 # CAS latency
WL                                      4 # Write latency
nbkgrp                                  4 # number of bank groups
CCDL                                    3 # column to column delay between accesses to different bank groups
RTPL                                    2 # read to precharge delay between accesses to different bank groups
Total number of memory sub partition = 12
addr_dec_mask[CHIP]  = 0000000000000000 	high:64 low:0
addr_dec_mask[BK]    = 000000000000e100 	high:16 low:8
addr_dec_mask[ROW]   = 000000000fff0000 	high:28 low:16
addr_dec_mask[COL]   = 0000000000001eff 	high:13 low:0
addr_dec_mask[BURST] = 000000000000003f 	high:6 low:0
sub_partition_id_mask = 0000000000000100
GPGPU-Sim uArch: clock freqs: 700000000.000000:700000000.000000:700000000.000000:924000000.000000
GPGPU-Sim uArch: clock periods: 0.00000000142857142857:0.00000000142857142857:0.00000000142857142857:0.00000000108225108225
*** Initializing Memory Statistics ***
GPGPU-Sim uArch: interconnect node map (shaderID+MemID to icntID)
GPGPU-Sim uArch: Memory nodes ID start from index: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
GPGPU-Sim uArch: interconnect node reverse map (icntID to shaderID+MemID)
GPGPU-Sim uArch: Memory nodes start from ID: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
83a4e518f69376f7e08643a3a9e17862  /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
GPGPU-Sim uArch: performance model initialization complete.
GPGPU-Sim PTX: __cudaRegisterFatBinary, fat_cubin_handle = 1, filename=mm.cu
self exe links to: /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
Running md5sum using "md5sum /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM "
Running cuobjdump using "$CUDA_INSTALL_PATH/bin/cuobjdump -ptx -elf -sass /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM > _cuobjdump_complete_output_Pe3VEJ"
Parsing file _cuobjdump_complete_output_Pe3VEJ
######### cuobjdump parser ########
## Adding new section ELF
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_1.ptx
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section ELF
Adding arch: sm_20
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_2.ptx
Adding arch: sm_20
Adding identifier: mm.cu
Done parsing!!!
GPGPU-Sim PTX: __cudaRegisterFunction _Z14matrix_mul_gpuPiS_S_i : hostFun 0x0x400ce0, fat_cubin_handle = 1
GPGPU-Sim PTX: instruction assembly for function '_Z14matrix_mul_gpuPiS_S_i'...   done.
GPGPU-Sim PTX: finding reconvergence points for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: reconvergence points for _Z14matrix_mul_gpuPiS_S_i...
GPGPU-Sim PTX:  1 (potential) branch divergence @  PC=0x048 (_1.ptx:71) @%p1 bra $Lt_0_2306;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX:  2 (potential) branch divergence @  PC=0x130 (_1.ptx:103) @%p2 bra $Lt_0_1794;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:  3 (potential) branch divergence @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX: ... end of reconvergence points for _Z14matrix_mul_gpuPiS_S_i
GPGPU-Sim PTX: ... done pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'.
GPGPU-Sim PTX: finished parsing EMBEDDED .ptx file _1.ptx
Adding _cuobjdump_2.ptx with cubin handle 1
GPGPU-Sim PTX: extracting embedded .ptx to temporary file "_ptx_XoC8lA"
Running: cat _ptx_XoC8lA | sed 's/.version 1.5/.version 1.4/' | sed 's/, texmode_independent//' | sed 's/\(\.extern \.const\[1\] .b8 \w\+\)\[\]/\1\[1\]/' | sed 's/const\[.\]/const\[0\]/g' > _ptx2_X4Pl3q
GPGPU-Sim PTX: generating ptxinfo using "$CUDA_INSTALL_PATH/bin/ptxas --gpu-name=sm_20 -v _ptx2_X4Pl3q --output-file  /dev/null 2> _ptx_XoC8lAinfo"
GPGPU-Sim PTX: Kernel '_Z14matrix_mul_gpuPiS_S_i' : regs=14, lmem=0, smem=0, cmem=60
GPGPU-Sim PTX: removing ptxinfo using "rm -f _ptx_XoC8lA _ptx2_X4Pl3q _ptx_XoC8lAinfo"
GPGPU-Sim PTX: loading globals with explicit initializers... 
GPGPU-Sim PTX: finished loading globals (0 bytes total).
GPGPU-Sim PTX: loading constants with explicit initializers...  done.
Block(10,10)   Grid(15,8).

GPGPU-Sim PTX: cudaLaunch for 0x0x400ce0 (mode=performance simulation) on stream 0
GPGPU-Sim PTX: pushing kernel '_Z14matrix_mul_gpuPiS_S_i' to stream 0, gridDim= (15,8,1) blockDim = (10,10,1) 
kernel '_Z14matrix_mul_gpuPiS_S_i' transfer to GPU hardware scheduler
GPGPU-Sim uArch: Shader 4 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: CTA/core = 8, limited by: cta_limit
GPGPU-Sim uArch: core:  4, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 8 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  8, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 12 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 12, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 16 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 16, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 20 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 20, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 24 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 24, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 28 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 28, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 32 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 32, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 36 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 36, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 40 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 40, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 44 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 44, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 48 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 48, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 52 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 52, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 56 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 56, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 0 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  0, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 5 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  5, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 9 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  9, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 13 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 13, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 17 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 17, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 21 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 21, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 25 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 25, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 29 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 29, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 33 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 33, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 37 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 37, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 41 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 41, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 45 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 45, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 49 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 49, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 53 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 53, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 57 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 57, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 1 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  1, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 6 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  6, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 10 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 10, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 14 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 14, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 18 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 18, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 22 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 22, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 26 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 26, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 30 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 30, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 34 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 34, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 38 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 38, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 42 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 42, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 46 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 46, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 50 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 50, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 54 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 54, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 58 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 58, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 2 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  2, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 7 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  7, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 11 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 11, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 15 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 15, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 19 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 19, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 23 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 23, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 27 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 27, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 31 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 31, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 35 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 35, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 39 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 39, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 43 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 43, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 47 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 47, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 51 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 51, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 55 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 55, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 59 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 59, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 3 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  3, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: core:  4, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  8, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 12, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 16, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 20, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 24, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 28, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 32, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 36, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 40, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 44, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 48, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 52, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 56, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  0, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  5, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  9, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 13, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 17, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 21, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 25, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 29, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 33, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 37, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 41, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 45, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 49, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 53, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 57, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  1, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  6, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 10, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 14, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 18, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 22, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 26, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 30, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 34, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 38, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 42, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 46, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 50, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 54, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 58, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  2, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  7, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 11, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 15, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 19, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 23, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 27, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 31, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 35, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 39, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 43, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 47, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 51, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 55, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 59, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core:  3, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: cycles simulated: 500  inst.: 49456 (ipc=98.9) sim_rate=49456 (inst/sec) elapsed = 0:0:00:01 / Mon Jun 14 15:46:20 2021
GPGPU-Sim PTX: 100000 instructions simulated : ctaid=(1,0,0) tid=(1,5,0)
GPGPU-Sim PTX: 200000 instructions simulated : ctaid=(0,4,0) tid=(1,1,0)
GPGPU-Sim PTX: 300000 instructions simulated : ctaid=(8,2,0) tid=(5,3,0)
GPGPU-Sim uArch: cycles simulated: 1500  inst.: 305100 (ipc=203.4) sim_rate=152550 (inst/sec) elapsed = 0:0:00:02 / Mon Jun 14 15:46:21 2021
GPGPU-Sim PTX: 400000 instructions simulated : ctaid=(2,2,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 2500  inst.: 473544 (ipc=189.4) sim_rate=157848 (inst/sec) elapsed = 0:0:00:03 / Mon Jun 14 15:46:22 2021
GPGPU-Sim PTX: 500000 instructions simulated : ctaid=(12,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 600000 instructions simulated : ctaid=(3,6,0) tid=(7,4,0)
GPGPU-Sim PTX: 700000 instructions simulated : ctaid=(8,1,0) tid=(3,8,0)
GPGPU-Sim PTX: 800000 instructions simulated : ctaid=(3,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 900000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 3500  inst.: 874228 (ipc=249.8) sim_rate=218557 (inst/sec) elapsed = 0:0:00:04 / Mon Jun 14 15:46:23 2021
GPGPU-Sim PTX: 1000000 instructions simulated : ctaid=(8,6,0) tid=(7,8,0)
GPGPU-Sim PTX: 1100000 instructions simulated : ctaid=(1,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 1200000 instructions simulated : ctaid=(5,0,0) tid=(9,1,0)
GPGPU-Sim PTX: 1300000 instructions simulated : ctaid=(11,0,0) tid=(9,9,0)
GPGPU-Sim uArch: cycles simulated: 4500  inst.: 1279928 (ipc=284.4) sim_rate=255985 (inst/sec) elapsed = 0:0:00:05 / Mon Jun 14 15:46:24 2021
GPGPU-Sim PTX: 1400000 instructions simulated : ctaid=(9,0,0) tid=(3,4,0)
GPGPU-Sim PTX: 1500000 instructions simulated : ctaid=(10,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 1600000 instructions simulated : ctaid=(4,5,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 5500  inst.: 1658768 (ipc=301.6) sim_rate=276461 (inst/sec) elapsed = 0:0:00:06 / Mon Jun 14 15:46:25 2021
GPGPU-Sim PTX: 1700000 instructions simulated : ctaid=(10,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 1800000 instructions simulated : ctaid=(0,3,0) tid=(5,7,0)
GPGPU-Sim PTX: 1900000 instructions simulated : ctaid=(1,1,0) tid=(7,6,0)
GPGPU-Sim PTX: 2000000 instructions simulated : ctaid=(14,4,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 6500  inst.: 2010508 (ipc=309.3) sim_rate=287215 (inst/sec) elapsed = 0:0:00:07 / Mon Jun 14 15:46:26 2021
GPGPU-Sim PTX: 2100000 instructions simulated : ctaid=(2,5,0) tid=(1,7,0)
GPGPU-Sim PTX: 2200000 instructions simulated : ctaid=(5,2,0) tid=(9,7,0)
GPGPU-Sim PTX: 2300000 instructions simulated : ctaid=(1,2,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 7500  inst.: 2365768 (ipc=315.4) sim_rate=295721 (inst/sec) elapsed = 0:0:00:08 / Mon Jun 14 15:46:27 2021
GPGPU-Sim PTX: 2400000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 2500000 instructions simulated : ctaid=(14,2,0) tid=(3,4,0)
GPGPU-Sim PTX: 2600000 instructions simulated : ctaid=(11,4,0) tid=(5,3,0)
GPGPU-Sim PTX: 2700000 instructions simulated : ctaid=(0,1,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 8500  inst.: 2717496 (ipc=319.7) sim_rate=301944 (inst/sec) elapsed = 0:0:00:09 / Mon Jun 14 15:46:28 2021
GPGPU-Sim PTX: 2800000 instructions simulated : ctaid=(1,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 2900000 instructions simulated : ctaid=(6,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 3000000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 9500  inst.: 3069396 (ipc=323.1) sim_rate=306939 (inst/sec) elapsed = 0:0:00:10 / Mon Jun 14 15:46:29 2021
GPGPU-Sim PTX: 3100000 instructions simulated : ctaid=(9,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 3200000 instructions simulated : ctaid=(5,1,0) tid=(7,2,0)
GPGPU-Sim PTX: 3300000 instructions simulated : ctaid=(12,6,0) tid=(9,5,0)
GPGPU-Sim PTX: 3400000 instructions simulated : ctaid=(0,0,0) tid=(5,1,0)
GPGPU-Sim uArch: cycles simulated: 10500  inst.: 3416504 (ipc=325.4) sim_rate=310591 (inst/sec) elapsed = 0:0:00:11 / Mon Jun 14 15:46:30 2021
GPGPU-Sim PTX: 3500000 instructions simulated : ctaid=(14,7,0) tid=(3,8,0)
GPGPU-Sim PTX: 3600000 instructions simulated : ctaid=(8,0,0) tid=(1,7,0)
GPGPU-Sim PTX: 3700000 instructions simulated : ctaid=(12,3,0) tid=(5,3,0)
GPGPU-Sim PTX: 3800000 instructions simulated : ctaid=(13,2,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 11500  inst.: 3804716 (ipc=330.8) sim_rate=317059 (inst/sec) elapsed = 0:0:00:12 / Mon Jun 14 15:46:31 2021
GPGPU-Sim PTX: 3900000 instructions simulated : ctaid=(4,7,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 12000  inst.: 3965588 (ipc=330.5) sim_rate=305045 (inst/sec) elapsed = 0:0:00:13 / Mon Jun 14 15:46:32 2021
GPGPU-Sim PTX: 4000000 instructions simulated : ctaid=(4,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 4100000 instructions simulated : ctaid=(5,6,0) tid=(7,2,0)
GPGPU-Sim PTX: 4200000 instructions simulated : ctaid=(3,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 4300000 instructions simulated : ctaid=(5,2,0) tid=(5,7,0)
GPGPU-Sim uArch: cycles simulated: 13000  inst.: 4315724 (ipc=332.0) sim_rate=308266 (inst/sec) elapsed = 0:0:00:14 / Mon Jun 14 15:46:33 2021
GPGPU-Sim PTX: 4400000 instructions simulated : ctaid=(1,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 4500000 instructions simulated : ctaid=(4,3,0) tid=(7,6,0)
GPGPU-Sim PTX: 4600000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 14000  inst.: 4667380 (ipc=333.4) sim_rate=311158 (inst/sec) elapsed = 0:0:00:15 / Mon Jun 14 15:46:34 2021
GPGPU-Sim PTX: 4700000 instructions simulated : ctaid=(12,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 4800000 instructions simulated : ctaid=(12,7,0) tid=(7,6,0)
GPGPU-Sim PTX: 4900000 instructions simulated : ctaid=(14,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 5000000 instructions simulated : ctaid=(14,2,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 15000  inst.: 5017508 (ipc=334.5) sim_rate=313594 (inst/sec) elapsed = 0:0:00:16 / Mon Jun 14 15:46:35 2021
GPGPU-Sim PTX: 5100000 instructions simulated : ctaid=(7,3,0) tid=(3,8,0)
GPGPU-Sim PTX: 5200000 instructions simulated : ctaid=(14,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 5300000 instructions simulated : ctaid=(9,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 5400000 instructions simulated : ctaid=(3,5,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 16000  inst.: 5386128 (ipc=336.6) sim_rate=316831 (inst/sec) elapsed = 0:0:00:17 / Mon Jun 14 15:46:36 2021
GPGPU-Sim PTX: 5500000 instructions simulated : ctaid=(4,3,0) tid=(5,3,0)
GPGPU-Sim PTX: 5600000 instructions simulated : ctaid=(5,1,0) tid=(9,9,0)
GPGPU-Sim PTX: 5700000 instructions simulated : ctaid=(8,0,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 17000  inst.: 5743808 (ipc=337.9) sim_rate=319100 (inst/sec) elapsed = 0:0:00:18 / Mon Jun 14 15:46:37 2021
GPGPU-Sim PTX: 5800000 instructions simulated : ctaid=(3,1,0) tid=(1,3,0)
GPGPU-Sim PTX: 5900000 instructions simulated : ctaid=(0,7,0) tid=(9,1,0)
GPGPU-Sim PTX: 6000000 instructions simulated : ctaid=(3,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 6100000 instructions simulated : ctaid=(2,3,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 18000  inst.: 6097268 (ipc=338.7) sim_rate=320908 (inst/sec) elapsed = 0:0:00:19 / Mon Jun 14 15:46:38 2021
GPGPU-Sim PTX: 6200000 instructions simulated : ctaid=(2,0,0) tid=(9,3,0)
GPGPU-Sim PTX: 6300000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 6400000 instructions simulated : ctaid=(10,0,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 19000  inst.: 6468436 (ipc=340.4) sim_rate=323421 (inst/sec) elapsed = 0:0:00:20 / Mon Jun 14 15:46:39 2021
GPGPU-Sim PTX: 6500000 instructions simulated : ctaid=(13,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 6600000 instructions simulated : ctaid=(8,0,0) tid=(7,8,0)
GPGPU-Sim PTX: 6700000 instructions simulated : ctaid=(8,6,0) tid=(5,1,0)
GPGPU-Sim PTX: 6800000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 20000  inst.: 6825636 (ipc=341.3) sim_rate=325030 (inst/sec) elapsed = 0:0:00:21 / Mon Jun 14 15:46:40 2021
GPGPU-Sim PTX: 6900000 instructions simulated : ctaid=(5,6,0) tid=(9,9,0)
GPGPU-Sim PTX: 7000000 instructions simulated : ctaid=(4,5,0) tid=(5,9,0)
GPGPU-Sim PTX: 7100000 instructions simulated : ctaid=(11,0,0) tid=(3,2,0)
GPGPU-Sim PTX: 7200000 instructions simulated : ctaid=(12,4,0) tid=(9,9,0)
GPGPU-Sim uArch: cycles simulated: 21000  inst.: 7175100 (ipc=341.7) sim_rate=326140 (inst/sec) elapsed = 0:0:00:22 / Mon Jun 14 15:46:41 2021
GPGPU-Sim PTX: 7300000 instructions simulated : ctaid=(12,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 7400000 instructions simulated : ctaid=(1,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 7500000 instructions simulated : ctaid=(11,6,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 22000  inst.: 7529316 (ipc=342.2) sim_rate=327361 (inst/sec) elapsed = 0:0:00:23 / Mon Jun 14 15:46:42 2021
GPGPU-Sim PTX: 7600000 instructions simulated : ctaid=(4,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 7700000 instructions simulated : ctaid=(11,7,0) tid=(5,9,0)
GPGPU-Sim PTX: 7800000 instructions simulated : ctaid=(11,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 7900000 instructions simulated : ctaid=(7,5,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 23000  inst.: 7901844 (ipc=343.6) sim_rate=329243 (inst/sec) elapsed = 0:0:00:24 / Mon Jun 14 15:46:43 2021
GPGPU-Sim PTX: 8000000 instructions simulated : ctaid=(8,7,0) tid=(1,5,0)
GPGPU-Sim PTX: 8100000 instructions simulated : ctaid=(4,0,0) tid=(3,6,0)
GPGPU-Sim PTX: 8200000 instructions simulated : ctaid=(6,3,0) tid=(7,0,0)
GPGPU-Sim uArch: cycles simulated: 24000  inst.: 8258968 (ipc=344.1) sim_rate=330358 (inst/sec) elapsed = 0:0:00:25 / Mon Jun 14 15:46:44 2021
GPGPU-Sim PTX: 8300000 instructions simulated : ctaid=(12,7,0) tid=(3,6,0)
GPGPU-Sim PTX: 8400000 instructions simulated : ctaid=(5,5,0) tid=(3,0,0)
GPGPU-Sim PTX: 8500000 instructions simulated : ctaid=(1,2,0) tid=(1,3,0)
GPGPU-Sim PTX: 8600000 instructions simulated : ctaid=(13,7,0) tid=(3,8,0)
GPGPU-Sim uArch: cycles simulated: 25000  inst.: 8616376 (ipc=344.7) sim_rate=331399 (inst/sec) elapsed = 0:0:00:26 / Mon Jun 14 15:46:45 2021
GPGPU-Sim PTX: 8700000 instructions simulated : ctaid=(3,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 8800000 instructions simulated : ctaid=(11,4,0) tid=(9,5,0)
GPGPU-Sim PTX: 8900000 instructions simulated : ctaid=(10,7,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 26000  inst.: 8970768 (ipc=345.0) sim_rate=332250 (inst/sec) elapsed = 0:0:00:27 / Mon Jun 14 15:46:46 2021
GPGPU-Sim PTX: 9000000 instructions simulated : ctaid=(9,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 9100000 instructions simulated : ctaid=(10,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 9200000 instructions simulated : ctaid=(4,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 9300000 instructions simulated : ctaid=(8,4,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 27000  inst.: 9350236 (ipc=346.3) sim_rate=333937 (inst/sec) elapsed = 0:0:00:28 / Mon Jun 14 15:46:47 2021
GPGPU-Sim PTX: 9400000 instructions simulated : ctaid=(6,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 9500000 instructions simulated : ctaid=(8,1,0) tid=(5,7,0)
GPGPU-Sim PTX: 9600000 instructions simulated : ctaid=(5,1,0) tid=(9,7,0)
GPGPU-Sim PTX: 9700000 instructions simulated : ctaid=(7,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 28000  inst.: 9687816 (ipc=346.0) sim_rate=334062 (inst/sec) elapsed = 0:0:00:29 / Mon Jun 14 15:46:48 2021
GPGPU-Sim PTX: 9800000 instructions simulated : ctaid=(1,2,0) tid=(1,1,0)
GPGPU-Sim PTX: 9900000 instructions simulated : ctaid=(1,7,0) tid=(7,8,0)
GPGPU-Sim PTX: 10000000 instructions simulated : ctaid=(1,2,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 29000  inst.: 10051864 (ipc=346.6) sim_rate=335062 (inst/sec) elapsed = 0:0:00:30 / Mon Jun 14 15:46:49 2021
GPGPU-Sim PTX: 10100000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 10200000 instructions simulated : ctaid=(3,6,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 29500  inst.: 10222816 (ipc=346.5) sim_rate=329768 (inst/sec) elapsed = 0:0:00:31 / Mon Jun 14 15:46:50 2021
GPGPU-Sim PTX: 10300000 instructions simulated : ctaid=(2,1,0) tid=(3,4,0)
GPGPU-Sim PTX: 10400000 instructions simulated : ctaid=(11,2,0) tid=(1,5,0)
GPGPU-Sim PTX: 10500000 instructions simulated : ctaid=(6,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 30500  inst.: 10571584 (ipc=346.6) sim_rate=330362 (inst/sec) elapsed = 0:0:00:32 / Mon Jun 14 15:46:51 2021
GPGPU-Sim PTX: 10600000 instructions simulated : ctaid=(12,4,0) tid=(3,0,0)
GPGPU-Sim PTX: 10700000 instructions simulated : ctaid=(0,4,0) tid=(9,7,0)
GPGPU-Sim PTX: 10800000 instructions simulated : ctaid=(14,4,0) tid=(7,6,0)
GPGPU-Sim PTX: 10900000 instructions simulated : ctaid=(7,6,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 31500  inst.: 10941032 (ipc=347.3) sim_rate=331546 (inst/sec) elapsed = 0:0:00:33 / Mon Jun 14 15:46:52 2021
GPGPU-Sim PTX: 11000000 instructions simulated : ctaid=(6,3,0) tid=(5,5,0)
GPGPU-Sim PTX: 11100000 instructions simulated : ctaid=(13,2,0) tid=(7,4,0)
GPGPU-Sim PTX: 11200000 instructions simulated : ctaid=(6,4,0) tid=(7,0,0)
GPGPU-Sim PTX: 11300000 instructions simulated : ctaid=(3,6,0) tid=(9,9,0)
GPGPU-Sim uArch: cycles simulated: 32500  inst.: 11300316 (ipc=347.7) sim_rate=332362 (inst/sec) elapsed = 0:0:00:34 / Mon Jun 14 15:46:53 2021
GPGPU-Sim PTX: 11400000 instructions simulated : ctaid=(13,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 11500000 instructions simulated : ctaid=(4,5,0) tid=(5,5,0)
GPGPU-Sim PTX: 11600000 instructions simulated : ctaid=(12,2,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 33500  inst.: 11664956 (ipc=348.2) sim_rate=333284 (inst/sec) elapsed = 0:0:00:35 / Mon Jun 14 15:46:54 2021
GPGPU-Sim PTX: 11700000 instructions simulated : ctaid=(12,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 11800000 instructions simulated : ctaid=(7,1,0) tid=(7,0,0)
GPGPU-Sim PTX: 11900000 instructions simulated : ctaid=(2,2,0) tid=(5,5,0)
GPGPU-Sim PTX: 12000000 instructions simulated : ctaid=(8,6,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 34500  inst.: 12026356 (ipc=348.6) sim_rate=334065 (inst/sec) elapsed = 0:0:00:36 / Mon Jun 14 15:46:55 2021
GPGPU-Sim PTX: 12100000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 12200000 instructions simulated : ctaid=(11,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 12300000 instructions simulated : ctaid=(2,5,0) tid=(1,5,0)
GPGPU-Sim PTX: 12400000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 35500  inst.: 12395792 (ipc=349.2) sim_rate=335021 (inst/sec) elapsed = 0:0:00:37 / Mon Jun 14 15:46:56 2021
GPGPU-Sim PTX: 12500000 instructions simulated : ctaid=(2,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 12600000 instructions simulated : ctaid=(0,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 12700000 instructions simulated : ctaid=(6,3,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 36500  inst.: 12743812 (ipc=349.1) sim_rate=335363 (inst/sec) elapsed = 0:0:00:38 / Mon Jun 14 15:46:57 2021
GPGPU-Sim PTX: 12800000 instructions simulated : ctaid=(5,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 12900000 instructions simulated : ctaid=(12,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 13000000 instructions simulated : ctaid=(14,2,0) tid=(1,5,0)
GPGPU-Sim PTX: 13100000 instructions simulated : ctaid=(11,5,0) tid=(1,3,0)
GPGPU-Sim uArch: cycles simulated: 37500  inst.: 13091280 (ipc=349.1) sim_rate=335673 (inst/sec) elapsed = 0:0:00:39 / Mon Jun 14 15:46:58 2021
GPGPU-Sim PTX: 13200000 instructions simulated : ctaid=(8,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 13300000 instructions simulated : ctaid=(1,6,0) tid=(7,0,0)
GPGPU-Sim PTX: 13400000 instructions simulated : ctaid=(1,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 38500  inst.: 13449032 (ipc=349.3) sim_rate=336225 (inst/sec) elapsed = 0:0:00:40 / Mon Jun 14 15:46:59 2021
GPGPU-Sim PTX: 13500000 instructions simulated : ctaid=(3,0,0) tid=(1,3,0)
GPGPU-Sim PTX: 13600000 instructions simulated : ctaid=(7,5,0) tid=(3,4,0)
GPGPU-Sim PTX: 13700000 instructions simulated : ctaid=(3,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 13800000 instructions simulated : ctaid=(2,1,0) tid=(1,3,0)
GPGPU-Sim uArch: cycles simulated: 39500  inst.: 13803876 (ipc=349.5) sim_rate=336679 (inst/sec) elapsed = 0:0:00:41 / Mon Jun 14 15:47:00 2021
GPGPU-Sim PTX: 13900000 instructions simulated : ctaid=(1,2,0) tid=(3,6,0)
GPGPU-Sim PTX: 14000000 instructions simulated : ctaid=(1,4,0) tid=(9,1,0)
GPGPU-Sim PTX: 14100000 instructions simulated : ctaid=(0,2,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 40500  inst.: 14167076 (ipc=349.8) sim_rate=337311 (inst/sec) elapsed = 0:0:00:42 / Mon Jun 14 15:47:01 2021
GPGPU-Sim PTX: 14200000 instructions simulated : ctaid=(9,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 14300000 instructions simulated : ctaid=(4,6,0) tid=(3,8,0)
GPGPU-Sim PTX: 14400000 instructions simulated : ctaid=(4,5,0) tid=(9,7,0)
GPGPU-Sim PTX: 14500000 instructions simulated : ctaid=(8,6,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 41500  inst.: 14512928 (ipc=349.7) sim_rate=337509 (inst/sec) elapsed = 0:0:00:43 / Mon Jun 14 15:47:02 2021
GPGPU-Sim PTX: 14600000 instructions simulated : ctaid=(4,3,0) tid=(3,2,0)
GPGPU-Sim PTX: 14700000 instructions simulated : ctaid=(10,5,0) tid=(7,4,0)
GPGPU-Sim PTX: 14800000 instructions simulated : ctaid=(11,6,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 42500  inst.: 14871404 (ipc=349.9) sim_rate=337986 (inst/sec) elapsed = 0:0:00:44 / Mon Jun 14 15:47:03 2021
GPGPU-Sim PTX: 14900000 instructions simulated : ctaid=(7,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 15000000 instructions simulated : ctaid=(14,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 15100000 instructions simulated : ctaid=(9,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 15200000 instructions simulated : ctaid=(3,2,0) tid=(5,5,0)
GPGPU-Sim uArch: cycles simulated: 43500  inst.: 15233248 (ipc=350.2) sim_rate=338516 (inst/sec) elapsed = 0:0:00:45 / Mon Jun 14 15:47:04 2021
GPGPU-Sim PTX: 15300000 instructions simulated : ctaid=(14,1,0) tid=(5,5,0)
GPGPU-Sim PTX: 15400000 instructions simulated : ctaid=(0,2,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 44000  inst.: 15410888 (ipc=350.2) sim_rate=335019 (inst/sec) elapsed = 0:0:00:46 / Mon Jun 14 15:47:05 2021
GPGPU-Sim PTX: 15500000 instructions simulated : ctaid=(0,0,0) tid=(7,8,0)
GPGPU-Sim PTX: 15600000 instructions simulated : ctaid=(1,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 15700000 instructions simulated : ctaid=(6,3,0) tid=(3,6,0)
GPGPU-Sim uArch: cycles simulated: 45000  inst.: 15772716 (ipc=350.5) sim_rate=335589 (inst/sec) elapsed = 0:0:00:47 / Mon Jun 14 15:47:06 2021
GPGPU-Sim PTX: 15800000 instructions simulated : ctaid=(4,7,0) tid=(3,4,0)
GPGPU-Sim PTX: 15900000 instructions simulated : ctaid=(0,2,0) tid=(9,7,0)
GPGPU-Sim PTX: 16000000 instructions simulated : ctaid=(9,5,0) tid=(9,1,0)
GPGPU-Sim PTX: 16100000 instructions simulated : ctaid=(11,5,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 46000  inst.: 16147168 (ipc=351.0) sim_rate=336399 (inst/sec) elapsed = 0:0:00:48 / Mon Jun 14 15:47:07 2021
GPGPU-Sim PTX: 16200000 instructions simulated : ctaid=(6,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 16300000 instructions simulated : ctaid=(4,4,0) tid=(1,1,0)
GPGPU-Sim PTX: 16400000 instructions simulated : ctaid=(5,1,0) tid=(3,2,0)
GPGPU-Sim PTX: 16500000 instructions simulated : ctaid=(4,6,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 47000  inst.: 16499776 (ipc=351.1) sim_rate=336730 (inst/sec) elapsed = 0:0:00:49 / Mon Jun 14 15:47:08 2021
GPGPU-Sim PTX: 16600000 instructions simulated : ctaid=(10,2,0) tid=(5,1,0)
GPGPU-Sim uArch: Shader 16 finished CTA #0 (47265,0), 1 CTAs running
GPGPU-Sim uArch: Shader 29 finished CTA #0 (47414,0), 1 CTAs running
GPGPU-Sim uArch: Shader 5 finished CTA #0 (47423,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #0 (47436,0), 1 CTAs running
GPGPU-Sim uArch: Shader 51 finished CTA #0 (47497,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #0 (47499,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #0 (47501,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #0 (47559,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #1 (47584,0), 0 CTAs running
GPGPU-Sim uArch: Shader 14 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 39 finished CTA #0 (47615,0), 1 CTAs running
GPGPU-Sim uArch: Shader 16 finished CTA #1 (47620,0), 0 CTAs running
GPGPU-Sim uArch: Shader 16 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #0 (47632,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #0 (47633,0), 1 CTAs running
GPGPU-Sim uArch: Shader 7 finished CTA #0 (47636,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #1 (47654,0), 0 CTAs running
GPGPU-Sim uArch: Shader 17 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 10 finished CTA #0 (47661,0), 1 CTAs running
GPGPU-Sim uArch: Shader 34 finished CTA #1 (47663,0), 1 CTAs running
GPGPU-Sim uArch: Shader 53 finished CTA #0 (47671,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #0 (47690,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #1 (47725,0), 0 CTAs running
GPGPU-Sim uArch: Shader 18 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 27 finished CTA #0 (47732,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #0 (47734,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #0 (47754,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #0 (47758,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #0 (47760,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #1 (47767,0), 0 CTAs running
GPGPU-Sim uArch: Shader 12 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 5 finished CTA #1 (47768,0), 0 CTAs running
GPGPU-Sim uArch: Shader 5 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 32 finished CTA #0 (47776,0), 1 CTAs running
GPGPU-Sim uArch: Shader 27 finished CTA #1 (47790,0), 0 CTAs running
GPGPU-Sim uArch: Shader 27 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #0 (47794,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #0 (47796,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #1 (47798,0), 0 CTAs running
GPGPU-Sim uArch: Shader 8 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #0 (47811,0), 1 CTAs running
GPGPU-Sim uArch: Shader 25 finished CTA #0 (47812,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #1 (47813,0), 0 CTAs running
GPGPU-Sim uArch: Shader 3 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 13 finished CTA #1 (47818,0), 1 CTAs running
GPGPU-Sim uArch: Shader 13 finished CTA #0 (47838,0), 0 CTAs running
GPGPU-Sim uArch: Shader 13 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 44 finished CTA #0 (47844,0), 1 CTAs running
GPGPU-Sim uArch: Shader 32 finished CTA #1 (47848,0), 0 CTAs running
GPGPU-Sim uArch: Shader 32 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 50 finished CTA #1 (47862,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #1 (47863,0), 0 CTAs running
GPGPU-Sim uArch: Shader 23 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 53 finished CTA #1 (47865,0), 0 CTAs running
GPGPU-Sim uArch: Shader 53 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 7 finished CTA #1 (47867,0), 0 CTAs running
GPGPU-Sim uArch: Shader 7 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #1 (47875,0), 1 CTAs running
GPGPU-Sim uArch: Shader 52 finished CTA #1 (47884,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 19 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 40 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 6 finished CTA #0 (47888,0), 1 CTAs running
GPGPU-Sim uArch: Shader 57 finished CTA #0 (47892,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #1 (47904,0), 0 CTAs running
GPGPU-Sim uArch: Shader 55 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 52 finished CTA #0 (47906,0), 0 CTAs running
GPGPU-Sim uArch: Shader 52 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #0 (47909,0), 0 CTAs running
GPGPU-Sim uArch: Shader 20 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 0 finished CTA #0 (47914,0), 1 CTAs running
GPGPU-Sim uArch: Shader 10 finished CTA #1 (47915,0), 0 CTAs running
GPGPU-Sim uArch: Shader 10 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 29 finished CTA #1 (47919,0), 0 CTAs running
GPGPU-Sim uArch: Shader 29 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 11 finished CTA #0 (47938,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #1 (47963,0), 0 CTAs running
GPGPU-Sim uArch: Shader 45 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #1 (47964,0), 1 CTAs running
GPGPU-Sim uArch: Shader 11 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 11 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 57 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 57 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #0 (47967,0), 1 CTAs running
GPGPU-Sim uArch: Shader 43 finished CTA #0 (47975,0), 1 CTAs running
GPGPU-Sim uArch: Shader 47 finished CTA #0 (47978,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #0 (47985,0), 1 CTAs running
GPGPU-Sim uArch: Shader 36 finished CTA #0 (47987,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #1 (47988,0), 0 CTAs running
GPGPU-Sim uArch: Shader 4 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #1 (47991,0), 0 CTAs running
GPGPU-Sim uArch: Shader 9 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #0 (47996,0), 1 CTAs running
GPGPU-Sim uArch: cycles simulated: 48000  inst.: 16631860 (ipc=346.5) sim_rate=332637 (inst/sec) elapsed = 0:0:00:50 / Mon Jun 14 15:47:09 2021
GPGPU-Sim uArch: Shader 21 finished CTA #1 (48000,0), 0 CTAs running
GPGPU-Sim uArch: Shader 21 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 51 finished CTA #1 (48001,0), 0 CTAs running
GPGPU-Sim uArch: Shader 51 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #0 (48008,0), 0 CTAs running
GPGPU-Sim uArch: Shader 15 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #0 (48010,0), 1 CTAs running
GPGPU-Sim uArch: Shader 30 finished CTA #0 (48018,0), 1 CTAs running
GPGPU-Sim uArch: Shader 0 finished CTA #1 (48019,0), 0 CTAs running
GPGPU-Sim uArch: Shader 0 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 54 finished CTA #0 (48026,0), 1 CTAs running
GPGPU-Sim uArch: Shader 6 finished CTA #1 (48028,0), 0 CTAs running
GPGPU-Sim uArch: Shader 6 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #0 (48031,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #0 (48042,0), 1 CTAs running
GPGPU-Sim uArch: Shader 54 finished CTA #1 (48056,0), 0 CTAs running
GPGPU-Sim uArch: Shader 54 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #1 (48059,0), 0 CTAs running
GPGPU-Sim uArch: Shader 24 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #1 (48061,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #1 (48063,0), 0 CTAs running
GPGPU-Sim uArch: Shader 48 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 30 finished CTA #1 (48070,0), 0 CTAs running
GPGPU-Sim uArch: Shader 30 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 25 finished CTA #1 (48079,0), 0 CTAs running
GPGPU-Sim uArch: Shader 25 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #1 (48081,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #1 (48083,0), 0 CTAs running
GPGPU-Sim uArch: Shader 59 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #0 (48084,0), 1 CTAs running
GPGPU-Sim uArch: Shader 49 finished CTA #1 (48086,0), 1 CTAs running
GPGPU-Sim uArch: Shader 39 finished CTA #1 (48096,0), 0 CTAs running
GPGPU-Sim uArch: Shader 39 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #0 (48098,0), 0 CTAs running
GPGPU-Sim uArch: Shader 2 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 49 finished CTA #0 (48102,0), 0 CTAs running
GPGPU-Sim uArch: Shader 49 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 28 finished CTA #0 (48104,0), 1 CTAs running
GPGPU-Sim uArch: Shader 28 finished CTA #1 (48107,0), 0 CTAs running
GPGPU-Sim uArch: Shader 28 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #0 (48111,0), 1 CTAs running
GPGPU-Sim uArch: Shader 50 finished CTA #0 (48111,0), 0 CTAs running
GPGPU-Sim uArch: Shader 50 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #1 (48114,0), 0 CTAs running
GPGPU-Sim uArch: Shader 22 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #1 (48132,0), 0 CTAs running
GPGPU-Sim uArch: Shader 58 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 47 finished CTA #1 (48143,0), 0 CTAs running
GPGPU-Sim uArch: Shader 47 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #0 (48153,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #1 (48153,0), 0 CTAs running
GPGPU-Sim uArch: Shader 35 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 36 finished CTA #1 (48154,0), 0 CTAs running
GPGPU-Sim uArch: Shader 36 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #1 (48160,0), 0 CTAs running
GPGPU-Sim uArch: Shader 31 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 26 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 33 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #0 (48165,0), 0 CTAs running
GPGPU-Sim uArch: Shader 38 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 34 finished CTA #0 (48177,0), 0 CTAs running
GPGPU-Sim uArch: Shader 34 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #1 (48188,0), 0 CTAs running
GPGPU-Sim uArch: Shader 56 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #1 (48189,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #0 (48192,0), 1 CTAs running
GPGPU-Sim uArch: Shader 37 finished CTA #1 (48202,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #1 (48212,0), 0 CTAs running
GPGPU-Sim uArch: Shader 1 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #0 (48216,0), 0 CTAs running
GPGPU-Sim uArch: Shader 41 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 37 finished CTA #0 (48218,0), 0 CTAs running
GPGPU-Sim uArch: Shader 37 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #1 (48221,0), 0 CTAs running
GPGPU-Sim uArch: Shader 46 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #0 (48226,0), 1 CTAs running
GPGPU-Sim uArch: Shader 44 finished CTA #1 (48233,0), 0 CTAs running
GPGPU-Sim uArch: Shader 44 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #1 (48248,0), 0 CTAs running
GPGPU-Sim uArch: Shader 42 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 43 finished CTA #1 (48281,0), 0 CTAs running
GPGPU-Sim uArch: Shader 43 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: GPU detected kernel '_Z14matrix_mul_gpuPiS_S_i' finished on shader 43.
kernel_name = _Z14matrix_mul_gpuPiS_S_i 
kernel_launch_uid = 1 
gpu_sim_cycle = 48282
gpu_sim_insn = 16632000
gpu_ipc =     344.4762
gpu_tot_sim_cycle = 48282
gpu_tot_sim_insn = 16632000
gpu_tot_ipc =     344.4762
gpu_tot_issued_cta = 0
gpu_stall_dramfull = 42547
gpu_stall_icnt2sh    = 68778
gpu_total_sim_rate=332640

========= Core cache stats =========
L1I_cache:
	L1I_total_cache_accesses = 371520
	L1I_total_cache_misses = 1920
	L1I_total_cache_miss_rate = 0.0052
	L1I_total_cache_pending_hits = 0
	L1I_total_cache_reservation_fails = 0
L1D_cache:
	L1D_cache_core[0]: Access = 21513, Miss = 1284, Miss_rate = 0.060, Pending_hits = 5147, Reservation_fails = 2614
	L1D_cache_core[1]: Access = 21489, Miss = 1278, Miss_rate = 0.059, Pending_hits = 5140, Reservation_fails = 1796
	L1D_cache_core[2]: Access = 21492, Miss = 1281, Miss_rate = 0.060, Pending_hits = 5151, Reservation_fails = 2084
	L1D_cache_core[3]: Access = 21457, Miss = 1268, Miss_rate = 0.059, Pending_hits = 5125, Reservation_fails = 1525
	L1D_cache_core[4]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5144, Reservation_fails = 928
	L1D_cache_core[5]: Access = 21481, Miss = 1272, Miss_rate = 0.059, Pending_hits = 5142, Reservation_fails = 2045
	L1D_cache_core[6]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5143, Reservation_fails = 2898
	L1D_cache_core[7]: Access = 21505, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5158, Reservation_fails = 3324
	L1D_cache_core[8]: Access = 21508, Miss = 1279, Miss_rate = 0.059, Pending_hits = 5149, Reservation_fails = 2750
	L1D_cache_core[9]: Access = 21505, Miss = 1287, Miss_rate = 0.060, Pending_hits = 5157, Reservation_fails = 3313
	L1D_cache_core[10]: Access = 21508, Miss = 1293, Miss_rate = 0.060, Pending_hits = 5164, Reservation_fails = 3276
	L1D_cache_core[11]: Access = 21505, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5148, Reservation_fails = 2931
	L1D_cache_core[12]: Access = 21508, Miss = 1292, Miss_rate = 0.060, Pending_hits = 5150, Reservation_fails = 2987
	L1D_cache_core[13]: Access = 21513, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5145, Reservation_fails = 2983
	L1D_cache_core[14]: Access = 21516, Miss = 1285, Miss_rate = 0.060, Pending_hits = 5136, Reservation_fails = 3154
	L1D_total_cache_accesses = 322468
	L1D_total_cache_misses = 19228
	L1D_total_cache_miss_rate = 0.0596
	L1D_total_cache_pending_hits = 77199
	L1D_total_cache_reservation_fails = 38608
	L1D_cache_data_port_util = 0.078
	L1D_cache_fill_port_util = 0.006
L1C_cache:
	L1C_total_cache_accesses = 1920
	L1C_total_cache_misses = 480
	L1C_total_cache_miss_rate = 0.2500
	L1C_total_cache_pending_hits = 0
	L1C_total_cache_reservation_fails = 0
L1T_cache:
	L1T_total_cache_accesses = 0
	L1T_total_cache_misses = 0
	L1T_total_cache_pending_hits = 0
	L1T_total_cache_reservation_fails = 0

Total_core_cache_stats:
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 225461
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 77199
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 17972
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 13408
	Total_core_cache_stats_breakdown[CONST_ACC_R][HIT] = 1440
	Total_core_cache_stats_breakdown[CONST_ACC_R][MISS] = 480
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 580
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 1256
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][RESERVATION_FAIL] = 25200
	Total_core_cache_stats_breakdown[INST_ACC_R][HIT] = 369600
	Total_core_cache_stats_breakdown[INST_ACC_R][MISS] = 1920
Shader 0 warp_id issue ditsribution:
warp_id:
0, 1, 2, 3, 4, 5, 6, 7, 
distro:
1388, 1388, 1388, 1388, 1388, 1388, 1388, 1388, 
gpgpu_n_tot_thrd_icount = 21319680
gpgpu_n_tot_w_icount = 666240
gpgpu_n_stall_shd_mem = 216596
gpgpu_n_mem_read_local = 0
gpgpu_n_mem_write_local = 0
gpgpu_n_mem_read_global = 17972
gpgpu_n_mem_write_global = 1836
gpgpu_n_mem_texture = 0
gpgpu_n_mem_const = 60
gpgpu_n_load_insn  = 3600000
gpgpu_n_store_insn = 12000
gpgpu_n_shmem_insn = 0
gpgpu_n_tex_insn = 0
gpgpu_n_const_mem_insn = 0
gpgpu_n_param_mem_insn = 48000
gpgpu_n_shmem_bkconflict = 0
gpgpu_n_cache_bkconflict = 0
gpgpu_n_intrawarp_mshr_merge = 0
gpgpu_n_cmem_portconflict = 0
gpgpu_stall_shd_mem[c_mem][bk_conf] = 0
gpgpu_stall_shd_mem[c_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[c_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[c_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[t_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[t_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[t_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[s_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][coal_stall] = 216596
gpgpu_stall_shd_mem[gl_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[g_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[g_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpu_reg_bank_conflict_stalls = 0
Warp Occupancy Distribution:
Stall:169500	W0_Idle:293067	W0_Scoreboard:4645929	W1:0	W2:0	W3:0	W4:166560	W5:0	W6:0	W7:0	W8:0	W9:0	W10:0	W11:0	W12:0	W13:0	W14:0	W15:0	W16:0	W17:0	W18:0	W19:0	W20:0	W21:0	W22:0	W23:0	W24:0	W25:0	W26:0	W27:0	W28:0	W29:0	W30:0	W31:0	W32:499680
traffic_breakdown_coretomem[CONST_ACC_R] = 480 {8:60,}
traffic_breakdown_coretomem[GLOBAL_ACC_R] = 143776 {8:17972,}
traffic_breakdown_coretomem[GLOBAL_ACC_W] = 117600 {40:1008,72:552,136:276,}
traffic_breakdown_coretomem[INST_ACC_R] = 1920 {8:240,}
traffic_breakdown_memtocore[CONST_ACC_R] = 4320 {72:60,}
traffic_breakdown_memtocore[GLOBAL_ACC_R] = 2444192 {136:17972,}
traffic_breakdown_memtocore[GLOBAL_ACC_W] = 14688 {8:1836,}
traffic_breakdown_memtocore[INST_ACC_R] = 32640 {136:240,}
maxmrqlatency = 12 
maxdqlatency = 0 
maxmflatency = 1356 
averagemflatency = 264 
max_icnt2mem_latency = 1205 
max_icnt2sh_latency = 48281 
mrq_lat_table:1080 	32 	4 	10 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
dq_lat_table:0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_table:0 	0 	0 	0 	0 	0 	0 	10720 	8178 	899 	71 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2mem_lat_table:0 	0 	0 	15440 	432 	764 	1403 	1061 	742 	253 	13 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2sh_lat_table:0 	0 	0 	3446 	13655 	892 	39 	0 	0 	0 	0 	0 	0 	0 	0 	1836 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_pw_table:0 	0 	0 	0 	0 	0 	0 	87 	6 	2 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
maximum concurrent accesses to same row:
dram[0]:         1         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
maximum service time to same row:
dram[0]:       502         0         0         0      1650      1707      1653      1501     12085     12831     36822     37578         0         0         0         0 
dram[1]:      1485         0         0         0      1763      1494      1765      1521     12204     13041     36903     37765         0         0         0         0 
dram[2]:         0         0         0         0      1503      1510      1526      1516     12325     13141     37102     37849         0         0         0         0 
dram[3]:         0         0         0         0      1499      1588      1525      1678     12515     13263     37144     38024         0         0         0         0 
dram[4]:         0         0         0         0      1616      1493      1484      1515     12585     13452     37341     38075         0         0         0         0 
dram[5]:         0         0         0         0      1497      1606      1519      1688     12779     13531     37456     38221         0         0         0         0 
average row accesses per activate:
dram[0]:  1.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 21.000000 19.000000      -nan      -nan      -nan      -nan 
dram[1]:  2.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 19.000000 16.000000      -nan      -nan      -nan      -nan 
dram[2]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 18.000000      -nan      -nan      -nan      -nan 
dram[3]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 17.000000      -nan      -nan      -nan      -nan 
dram[4]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 18.000000 21.000000      -nan      -nan      -nan      -nan 
dram[5]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 22.000000 16.000000      -nan      -nan      -nan      -nan 
average row locality = 1126/52 = 21.653847
number of total memory accesses made:
dram[0]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
total accesses: 0
min_bank_accesses = 0!
min_chip_accesses = 0!
number of total read accesses:
dram[0]:         3         0         0         0        10        10        32        32        32        32        16        16         0         0         0         0 
dram[1]:         2         0         0         0        10        10        32        32        32        32        16        15         0         0         0         0 
dram[2]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[3]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[4]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[5]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
total reads: 1084
min_bank_accesses = 0!
chip skew: 183/180 = 1.02
number of total write accesses:
dram[0]:         0         0         0         0         0         0         0         0         0         0         5         3         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         3         1         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         3         4         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         3         3         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         2         7         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         6         2         0         0         0         0 
total reads: 42
min_bank_accesses = 0!
chip skew: 9/4 = 2.25
average mf latency per bank:
dram[0]:       8505    none      none      none        6219      4879      4705      5583      4863      4800      4214      4411    none      none      none      none  
dram[1]:          0    none      none      none        4030      5132      5447      4339      4812      4857      4258      5293    none      none      none      none  
dram[2]:     none      none      none      none        4973      3789      4482      4576      4719      4883      4095      3891    none      none      none      none  
dram[3]:     none      none      none      none        3574      4782      4611      4852      4883      5030      4219      3979    none      none      none      none  
dram[4]:     none      none      none      none        6249      4014      5355      4358      4534      4853      4952      3543    none      none      none      none  
dram[5]:     none      none      none      none        4108      5204      4843      5156      4601      4623      3826      4504    none      none      none      none  
maximum mf latency per bank:
dram[0]:        486         0         0         0      1110      1069      1107      1325       592       781       876       721         0         0         0         0
dram[1]:          0         0         0         0      1097       809      1356       876       801       646       697       706         0         0         0         0
dram[2]:          0         0         0         0       882       679       907      1071       716       771       873       759         0         0         0         0
dram[3]:          0         0         0         0       377       983      1018      1068       742       781       690       817         0         0         0         0
dram[4]:          0         0         0         0      1081       527      1180       723       799       631       718       700         0         0         0         0
dram[5]:          0         0         0         0       543      1153      1006      1168       728       775       873       742         0         0         0         0

Number of Memory Banks Accessed per Memory Operation per Warp (from 0):
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
Average # of Memory Banks Accessed per Memory Operation per Warp=-nan

position of mrq chosen
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	

average position of mrq chosen = -nan
Memory Partition 0: 
Cache L2_bank_000:
MSHR contents

Cache L2_bank_001:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[0]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63344 n_act=11 n_pre=2 n_req=191 n_rd=366 n_write=8 bw_util=0.01174
n_activity=2903 dram_eff=0.2577
bk0: 6a 63659i bk1: 0a 63730i bk2: 0a 63734i bk3: 0a 63734i bk4: 20a 63671i bk5: 20a 63680i bk6: 64a 63581i bk7: 64a 63588i bk8: 64a 63586i bk9: 64a 63580i bk10: 32a 63625i bk11: 32a 63618i bk12: 0a 63727i bk13: 0a 63727i bk14: 0a 63730i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000784548
Memory Partition 1: 
Cache L2_bank_002:
MSHR contents

Cache L2_bank_003:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[1]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=9 n_pre=0 n_req=185 n_rd=362 n_write=5 bw_util=0.01152
n_activity=2703 dram_eff=0.2716
bk0: 4a 63711i bk1: 0a 63732i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 20a 63676i bk6: 64a 63580i bk7: 64a 63580i bk8: 64a 63587i bk9: 64a 63581i bk10: 32a 63624i bk11: 30a 63641i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63732i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000266746
Memory Partition 2: 
Cache L2_bank_004:
MSHR contents

Cache L2_bank_005:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[2]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=187 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2765 dram_eff=0.2662
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63679i bk5: 24a 63671i bk6: 64a 63581i bk7: 64a 63590i bk8: 64a 63588i bk9: 64a 63587i bk10: 32a 63623i bk11: 28a 63638i bk12: 0a 63728i bk13: 0a 63730i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000235364
Memory Partition 3: 
Cache L2_bank_006:
MSHR contents

Cache L2_bank_007:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[3]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63357 n_act=8 n_pre=0 n_req=186 n_rd=360 n_write=6 bw_util=0.01149
n_activity=2830 dram_eff=0.2587
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63680i bk5: 24a 63670i bk6: 64a 63580i bk7: 64a 63572i bk8: 64a 63586i bk9: 64a 63589i bk10: 32a 63621i bk11: 28a 63638i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000580565
Memory Partition 4: 
Cache L2_bank_008:
MSHR contents

Cache L2_bank_009:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[4]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63353 n_act=8 n_pre=0 n_req=189 n_rd=360 n_write=10 bw_util=0.01161
n_activity=2771 dram_eff=0.2671
bk0: 0a 63731i bk1: 0a 63733i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 24a 63670i bk6: 64a 63587i bk7: 64a 63587i bk8: 64a 63588i bk9: 64a 63581i bk10: 32a 63633i bk11: 28a 63627i bk12: 0a 63728i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000313819
Memory Partition 5: 
Cache L2_bank_010:
MSHR contents

Cache L2_bank_011:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[5]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=188 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2789 dram_eff=0.2639
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63733i bk4: 20a 63678i bk5: 24a 63668i bk6: 64a 63584i bk7: 64a 63582i bk8: 64a 63589i bk9: 64a 63588i bk10: 32a 63598i bk11: 28a 63623i bk12: 0a 63729i bk13: 0a 63730i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.00105129

========= L2 cache stats =========
L2_cache_bank[0]: Access = 1826, Miss = 93, Miss_rate = 0.051, Pending_hits = 248, Reservation_fails = 4441
L2_cache_bank[1]: Access = 1684, Miss = 90, Miss_rate = 0.053, Pending_hits = 231, Reservation_fails = 3387
L2_cache_bank[2]: Access = 1777, Miss = 92, Miss_rate = 0.052, Pending_hits = 239, Reservation_fails = 3478
L2_cache_bank[3]: Access = 1652, Miss = 89, Miss_rate = 0.054, Pending_hits = 227, Reservation_fails = 3558
L2_cache_bank[4]: Access = 1642, Miss = 90, Miss_rate = 0.055, Pending_hits = 236, Reservation_fails = 3430
L2_cache_bank[5]: Access = 1661, Miss = 90, Miss_rate = 0.054, Pending_hits = 232, Reservation_fails = 3472
L2_cache_bank[6]: Access = 1637, Miss = 90, Miss_rate = 0.055, Pending_hits = 237, Reservation_fails = 3884
L2_cache_bank[7]: Access = 1639, Miss = 90, Miss_rate = 0.055, Pending_hits = 250, Reservation_fails = 4069
L2_cache_bank[8]: Access = 1656, Miss = 90, Miss_rate = 0.054, Pending_hits = 250, Reservation_fails = 3783
L2_cache_bank[9]: Access = 1643, Miss = 90, Miss_rate = 0.055, Pending_hits = 241, Reservation_fails = 3742
L2_cache_bank[10]: Access = 1641, Miss = 90, Miss_rate = 0.055, Pending_hits = 239, Reservation_fails = 3801
L2_cache_bank[11]: Access = 1650, Miss = 90, Miss_rate = 0.055, Pending_hits = 243, Reservation_fails = 3849
L2_total_cache_accesses = 20108
L2_total_cache_misses = 1084
L2_total_cache_miss_rate = 0.0539
L2_total_cache_pending_hits = 2873
L2_total_cache_reservation_fails = 44894
L2_total_cache_breakdown:
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 14077
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 2839
	L2_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 1056
	L2_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 44379
	L2_cache_stats_breakdown[CONST_ACC_R][HIT] = 56
	L2_cache_stats_breakdown[CONST_ACC_R][HIT_RESERVED] = 3
	L2_cache_stats_breakdown[CONST_ACC_R][MISS] = 1
	L2_cache_stats_breakdown[CONST_ACC_R][RESERVATION_FAIL] = 129
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 1794
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT_RESERVED] = 19
	L2_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 23
	L2_cache_stats_breakdown[INST_ACC_R][HIT] = 224
	L2_cache_stats_breakdown[INST_ACC_R][HIT_RESERVED] = 12
	L2_cache_stats_breakdown[INST_ACC_R][MISS] = 4
	L2_cache_stats_breakdown[INST_ACC_R][RESERVATION_FAIL] = 386
L2_cache_data_port_util = 0.104
L2_cache_fill_port_util = 0.007

icnt_total_pkts_mem_to_simt=93076
icnt_total_pkts_simt_to_mem=23324
LD_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ST_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
----------------------------Interconnect-DETAILS--------------------------------
Class 0:
Packet latency average = 18.5053
	minimum = 6
	maximum = 729
Network latency average = 13.7655
	minimum = 6
	maximum = 426
Slowest packet = 623
Flit latency average = 11.0758
	minimum = 6
	maximum = 426
Slowest flit = 1683
Fragmentation average = 0
	minimum = 0
	maximum = 0
Injected packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Accepted packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Injected flit rate average = 0.0892902
	minimum = 0.0318131 (at node 3)
	maximum = 0.173936 (at node 15)
Accepted flit rate average= 0.0892902
	minimum = 0.0392072 (at node 24)
	maximum = 0.129406 (at node 9)
Injected packet length average = 2.89437
Accepted packet length average = 2.89437
Total in-flight flits = 0 (0 measured)
====== Overall Traffic Statistics ======
====== Traffic class 0 ======
Packet latency average = 18.5053 (1 samples)
	minimum = 6 (1 samples)
	maximum = 729 (1 samples)
Network latency average = 13.7655 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Flit latency average = 11.0758 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Fragmentation average = 0 (1 samples)
	minimum = 0 (1 samples)
	maximum = 0 (1 samples)
Injected packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Accepted packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Injected flit rate average = 0.0892902 (1 samples)
	minimum = 0.0318131 (1 samples)
	maximum = 0.173936 (1 samples)
Accepted flit rate average = 0.0892902 (1 samples)
	minimum = 0.0392072 (1 samples)
	maximum = 0.129406 (1 samples)
Injected packet size average = 2.89437 (1 samples)
Accepted packet size average = 2.89437 (1 samples)
Hops average = 1 (1 samples)
----------------------------END-of-Interconnect-DETAILS-------------------------


gpgpu_simulation_time = 0 days, 0 hrs, 0 min, 50 sec (50 sec)
gpgpu_simulation_rate = 332640 (inst/sec)
gpgpu_simulation_rate = 965 (cycle/sec)
total time is 49482 ms


        *** GPGPU-Sim Simulator Version 3.2.2  [build 0] ***


GPGPU-Sim PTX: simulation mode 0 (can change with PTX_SIM_MODE_FUNC environment variable:
               1=functional simulation only, 0=detailed performance simulator)
GPGPU-Sim: Configuration options:

-network_mode                           1 # Interconnection network mode
-inter_config_file   config_fermi_islip.icnt # Interconnection network config file
-gpgpu_ptx_use_cuobjdump                    1 # Use cuobjdump to extract ptx and sass from binaries
-gpgpu_experimental_lib_support                    0 # Try to extract code from cuda libraries [Broken because of unknown cudaGetExportTable]
-gpgpu_ptx_convert_to_ptxplus                    0 # Convert SASS (native ISA) to ptxplus and run ptxplus
-gpgpu_ptx_force_max_capability                   20 # Force maximum compute capability
-gpgpu_ptx_inst_debug_to_file                    0 # Dump executed instructions' debug information to file
-gpgpu_ptx_inst_debug_file       inst_debug.txt # Executed instructions' debug output file
-gpgpu_ptx_inst_debug_thread_uid                    1 # Thread UID for executed instructions' debug output
-gpgpu_simd_model                       1 # 1 = post-dominator
-gpgpu_shader_core_pipeline              1536:32 # shader core pipeline config, i.e., {<nthread>:<warpsize>}
-gpgpu_tex_cache:l1  4:128:24,L:R:m:N:L,F:128:4,128:2 # per-shader L1 texture cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>:<rf>}
-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4 # per-shader L1 constant memory cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:il1     4:128:4,L:R:f:N:L,A:2:32,4 # shader L1 instruction cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:dl1     32:128:4,L:L:m:N:H,A:32:8,8 # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PrefL1                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PreShared                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gmem_skip_L1D                          0 # global memory access skip L1D cache (implements -Xptxas -dlcm=cg, default=no skip)
-gpgpu_perfect_mem                      0 # enable perfect memory mode (no cache miss)
-n_regfile_gating_group                    4 # group of lanes that should be read/written together)
-gpgpu_clock_gated_reg_file                    0 # enable clock gated reg file for power calculations
-gpgpu_clock_gated_lanes                    0 # enable clock gated lanes for power calculations
-gpgpu_shader_registers                32768 # Number of registers per shader core. Limits number of concurrent CTAs. (default 8192)
-gpgpu_shader_cta                       8 # Maximum number of concurrent CTAs in shader (default 8)
-gpgpu_num_cta_barriers                   16 # Maximum number of named barriers per CTA (default 16)
-gpgpu_n_clusters                      15 # number of processing clusters
-gpgpu_n_cores_per_cluster                    4 # number of simd cores per cluster
-gpgpu_n_cluster_ejection_buffer_size                    8 # number of packets in ejection buffer
-gpgpu_n_ldst_response_buffer_size                    2 # number of response packets in ld/st unit ejection buffer
-gpgpu_shmem_size                   16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size                   49152 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefL1                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefShared                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_num_banks                   32 # Number of banks in the shared memory in each shader core (default 16)
-gpgpu_shmem_limited_broadcast                    0 # Limit shared memory to do one broadcast per cycle (default on)
-gpgpu_shmem_warp_parts                    1 # Number of portions a warp is divided into for shared memory bank conflict check 
-gpgpu_warpdistro_shader                   -1 # Specify which shader core to collect the warp size distribution from
-gpgpu_warp_issue_shader                    0 # Specify which shader core to collect the warp issue distribution from
-gpgpu_local_mem_map                    1 # Mapping from local memory space address to simulated GPU physical address space (default = enabled)
-gpgpu_num_reg_banks                   16 # Number of register banks (default = 8)
-gpgpu_reg_bank_use_warp_id                    0 # Use warp ID in mapping registers to banks (default = off)
-gpgpu_operand_collector_num_units_sp                    6 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_sfu                    8 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_mem                    2 # number of collector units (default = 2)
-gpgpu_operand_collector_num_units_gen                    0 # number of collector units (default = 0)
-gpgpu_operand_collector_num_in_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_operand_collector_num_out_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_coalesce_arch                   13 # Coalescing arch (default = 13, anything else is off for now)
-gpgpu_num_sched_per_core                    2 # Number of warp schedulers per core
-gpgpu_max_insn_issue_per_warp                    1 # Max number of instructions that can be issued per warp in one cycle by scheduler
-gpgpu_simt_core_sim_order                    1 # Select the simulation order of cores in a cluster (0=Fix, 1=Round-Robin)
-gpgpu_pipeline_widths        2,1,1,2,1,1,2 # Pipeline widths ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
-gpgpu_num_sp_units                     2 # Number of SP units (default=1)
-gpgpu_num_sfu_units                    1 # Number of SF units (default=1)
-gpgpu_num_mem_units                    1 # Number if ldst units (default=1) WARNING: not hooked up to anything
-gpgpu_scheduler                      gto # Scheduler configuration: < lrr | gto | two_level_active > If two_level_active:<num_active_warps>:<inner_prioritization>:<outer_prioritization>For complete list of prioritization values see shader.h enum scheduler_prioritization_typeDefault: gto
-gpgpu_dram_scheduler                    1 # 0 = fifo, 1 = FR-FCFS (defaul)
-gpgpu_dram_partition_queues              8:8:8:8 # i2$:$2d:d2$:$2i
-l2_ideal                               0 # Use a ideal L2 cache that always hit
-gpgpu_cache:dl2     64:128:8,L:B:m:W:L,A:32:4,4:0,32 # unified banked L2 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>}
-gpgpu_cache:dl2_texture_only                    0 # L2 cache used for texture only
-gpgpu_n_mem                            6 # number of memory modules (e.g. memory controllers) in gpu
-gpgpu_n_sub_partition_per_mchannel                    2 # number of memory subpartition in each memory module
-gpgpu_n_mem_per_ctrlr                    2 # number of memory chips per memory controller
-gpgpu_memlatency_stat                   14 # track and display latency statistics 0x2 enables MC, 0x4 enables queue logs
-gpgpu_frfcfs_dram_sched_queue_size                   16 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_return_queue_size                  116 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_buswidth                    4 # default = 4 bytes (8 bytes per cycle at DDR)
-gpgpu_dram_burst_length                    8 # Burst length of each DRAM request (default = 4 data bus cycle)
-dram_data_command_freq_ratio                    4 # Frequency ratio between DRAM data bus and command bus (default = 2 times, i.e. DDR)
-gpgpu_dram_timing_opt nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2 # DRAM timing parameters = {nbk:tCCD:tRRD:tRCD:tRAS:tRP:tRC:CL:WL:tCDLR:tWR:nbkgrp:tCCDL:tRTPL}
-rop_latency                          120 # ROP queue latency (default 85)
-dram_latency                         100 # DRAM latency (default 30)
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS # mapping memory address to dram model {dramid@<start bit>;<memory address map>}
-gpgpu_mem_addr_test                    0 # run sweep test to check address mapping for aliased address
-gpgpu_mem_address_mask                    1 # 0 = old addressing mask, 1 = new addressing mask, 2 = new add. mask + flipped bank sel and chip sel bits
-gpuwattch_xml_file  gpuwattch_gtx480.xml # GPUWattch XML file
-power_simulation_enabled                    1 # Turn on power simulator (1=On, 0=Off)
-power_per_cycle_dump                    0 # Dump detailed power output each cycle
-power_trace_enabled                    0 # produce a file for the power trace (1=On, 0=Off)
-power_trace_zlevel                     6 # Compression level of the power trace output log (0=no comp, 9=highest)
-steady_power_levels_enabled                    0 # produce a file for the steady power levels (1=On, 0=Off)
-steady_state_definition                  8:4 # allowed deviation:number of samples
-gpgpu_max_cycle                        0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_insn                         0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_cta                          0 # terminates gpu simulation early (0 = no limit)
-gpgpu_runtime_stat                   500 # display runtime statistics such as dram utilization {<freq>:<flag>}
-liveness_message_freq                    1 # Minimum number of seconds between simulation liveness messages (0 = always print)
-gpgpu_flush_l1_cache                    0 # Flush L1 cache at the end of each kernel call
-gpgpu_flush_l2_cache                    0 # Flush L2 cache at the end of each kernel call
-gpgpu_deadlock_detect                    1 # Stop the simulation at deadlock (1=on (default), 0=off)
-gpgpu_ptx_instruction_classification                    0 # if enabled will classify ptx instruction types per kernel (Max 255 kernels now)
-gpgpu_ptx_sim_mode                     0 # Select between Performance (default) or Functional simulation (1)
-gpgpu_clock_domains 700.0:700.0:700.0:924.0 # Clock Domain Frequencies in MhZ {<Core Clock>:<ICNT Clock>:<L2 Clock>:<DRAM Clock>}
-gpgpu_max_concurrent_kernel                    8 # maximum kernels that can run concurrently on GPU
-gpgpu_cflog_interval                    0 # Interval between each snapshot in control flow logger
-visualizer_enabled                     0 # Turn on visualizer output (1=On, 0=Off)
-visualizer_outputfile                 NULL # Specifies the output log file for visualizer
-visualizer_zlevel                      6 # Compression level of the visualizer output log (0=no comp, 9=highest)
-trace_enabled                          0 # Turn on traces
-trace_components                    none # comma seperated list of traces to enable. Complete list found in trace_streams.tup. Default none
-trace_sampling_core                    0 # The core which is printed using CORE_DPRINTF. Default 0
-trace_sampling_memory_partition                   -1 # The memory partition which is printed using MEMPART_DPRINTF. Default -1 (i.e. all)
-enable_ptx_file_line_stats                    1 # Turn on PTX source line statistic profiling. (1 = On)
-ptx_line_stats_filename gpgpu_inst_stats.txt # Output file for PTX source line statistics.
-save_embedded_ptx                      0 # saves ptx files embedded in binary as <n>.ptx
-keep                                   0 # keep intermediate files created by GPGPU-Sim when interfacing with external programs
-gpgpu_ptx_save_converted_ptxplus                    0 # Saved converted ptxplus to a file
-ptx_opcode_latency_int         4,13,4,5,145 # Opcode latencies for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,19,25,145
-ptx_opcode_latency_fp          4,13,4,5,39 # Opcode latencies for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,30
-ptx_opcode_latency_dp         8,19,8,8,330 # Opcode latencies for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,335
-ptx_opcode_initiation_int            1,2,2,1,8 # Opcode initiation intervals for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,4,4,32
-ptx_opcode_initiation_fp            1,2,1,1,4 # Opcode initiation intervals for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,5
-ptx_opcode_initiation_dp         8,16,8,8,130 # Opcode initiation intervals for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,130
DRAM Timing Options:
nbk                                    16 # number of banks
CCD                                     2 # column to column delay
RRD                                     6 # minimal delay between activation of rows in different banks
RCD                                    12 # row to column delay
RAS                                    28 # time needed to activate row
RP                                     12 # time needed to precharge (deactivate) row
RC                                     40 # row cycle time
CDLR                                    5 # switching from write to read (changes tWTR)
WR                                     12 # last data-in to row precharge
CL                                     12 # CAS latency
WL                                      4 # Write latency
nbkgrp                                  4 # number of bank groups
CCDL                                    3 # column to column delay between accesses to different bank groups
RTPL                                    2 # read to precharge delay between accesses to different bank groups
Total number of memory sub partition = 12
addr_dec_mask[CHIP]  = 0000000000000000 	high:64 low:0
addr_dec_mask[BK]    = 000000000000e100 	high:16 low:8
addr_dec_mask[ROW]   = 000000000fff0000 	high:28 low:16
addr_dec_mask[COL]   = 0000000000001eff 	high:13 low:0
addr_dec_mask[BURST] = 000000000000003f 	high:6 low:0
sub_partition_id_mask = 0000000000000100
GPGPU-Sim uArch: clock freqs: 700000000.000000:700000000.000000:700000000.000000:924000000.000000
GPGPU-Sim uArch: clock periods: 0.00000000142857142857:0.00000000142857142857:0.00000000142857142857:0.00000000108225108225
*** Initializing Memory Statistics ***
GPGPU-Sim uArch: interconnect node map (shaderID+MemID to icntID)
GPGPU-Sim uArch: Memory nodes ID start from index: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
GPGPU-Sim uArch: interconnect node reverse map (icntID to shaderID+MemID)
GPGPU-Sim uArch: Memory nodes start from ID: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
83a4e518f69376f7e08643a3a9e17862  /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
GPGPU-Sim uArch: performance model initialization complete.
GPGPU-Sim PTX: __cudaRegisterFatBinary, fat_cubin_handle = 1, filename=mm.cu
self exe links to: /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
Running md5sum using "md5sum /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM "
Running cuobjdump using "$CUDA_INSTALL_PATH/bin/cuobjdump -ptx -elf -sass /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM > _cuobjdump_complete_output_y590Mi"
Parsing file _cuobjdump_complete_output_y590Mi
######### cuobjdump parser ########
## Adding new section ELF
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_1.ptx
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section ELF
Adding arch: sm_20
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_2.ptx
Adding arch: sm_20
Adding identifier: mm.cu
Done parsing!!!
GPGPU-Sim PTX: __cudaRegisterFunction _Z14matrix_mul_gpuPiS_S_i : hostFun 0x0x400ce0, fat_cubin_handle = 1
GPGPU-Sim PTX: instruction assembly for function '_Z14matrix_mul_gpuPiS_S_i'...   done.
GPGPU-Sim PTX: finding reconvergence points for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: reconvergence points for _Z14matrix_mul_gpuPiS_S_i...
GPGPU-Sim PTX:  1 (potential) branch divergence @  PC=0x048 (_1.ptx:71) @%p1 bra $Lt_0_2306;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX:  2 (potential) branch divergence @  PC=0x130 (_1.ptx:103) @%p2 bra $Lt_0_1794;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:  3 (potential) branch divergence @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX: ... end of reconvergence points for _Z14matrix_mul_gpuPiS_S_i
GPGPU-Sim PTX: ... done pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'.
GPGPU-Sim PTX: finished parsing EMBEDDED .ptx file _1.ptx
Adding _cuobjdump_2.ptx with cubin handle 1
GPGPU-Sim PTX: extracting embedded .ptx to temporary file "_ptx_js3a4w"
Running: cat _ptx_js3a4w | sed 's/.version 1.5/.version 1.4/' | sed 's/, texmode_independent//' | sed 's/\(\.extern \.const\[1\] .b8 \w\+\)\[\]/\1\[1\]/' | sed 's/const\[.\]/const\[0\]/g' > _ptx2_OiAllL
GPGPU-Sim PTX: generating ptxinfo using "$CUDA_INSTALL_PATH/bin/ptxas --gpu-name=sm_20 -v _ptx2_OiAllL --output-file  /dev/null 2> _ptx_js3a4winfo"
GPGPU-Sim PTX: Kernel '_Z14matrix_mul_gpuPiS_S_i' : regs=14, lmem=0, smem=0, cmem=60
GPGPU-Sim PTX: removing ptxinfo using "rm -f _ptx_js3a4w _ptx2_OiAllL _ptx_js3a4winfo"
GPGPU-Sim PTX: loading globals with explicit initializers... 
GPGPU-Sim PTX: finished loading globals (0 bytes total).
GPGPU-Sim PTX: loading constants with explicit initializers...  done.
Block(10,10)   Grid(15,8).

GPGPU-Sim PTX: cudaLaunch for 0x0x400ce0 (mode=performance simulation) on stream 0
GPGPU-Sim PTX: pushing kernel '_Z14matrix_mul_gpuPiS_S_i' to stream 0, gridDim= (15,8,1) blockDim = (10,10,1) 
kernel '_Z14matrix_mul_gpuPiS_S_i' transfer to GPU hardware scheduler
GPGPU-Sim uArch: Shader 4 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: CTA/core = 8, limited by: cta_limit
GPGPU-Sim uArch: core:  4, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 8 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  8, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 12 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 12, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 16 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 16, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 20 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 20, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 24 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 24, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 28 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 28, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 32 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 32, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 36 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 36, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 40 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 40, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 44 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 44, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 48 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 48, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 52 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 52, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 56 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 56, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 0 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  0, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 5 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  5, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 9 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  9, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 13 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 13, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 17 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 17, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 21 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 21, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 25 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 25, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 29 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 29, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 33 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 33, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 37 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 37, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 41 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 41, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 45 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 45, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 49 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 49, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 53 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 53, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 57 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 57, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 1 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  1, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 6 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  6, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 10 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 10, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 14 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 14, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 18 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 18, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 22 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 22, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 26 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 26, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 30 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 30, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 34 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 34, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 38 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 38, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 42 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 42, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 46 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 46, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 50 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 50, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 54 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 54, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 58 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 58, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 2 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  2, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 7 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  7, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 11 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 11, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 15 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 15, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 19 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 19, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 23 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 23, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 27 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 27, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 31 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 31, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 35 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 35, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 39 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 39, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 43 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 43, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 47 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 47, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 51 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 51, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 55 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 55, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 59 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 59, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 3 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  3, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: core:  4, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  8, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 12, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 16, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 20, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 24, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 28, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 32, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 36, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 40, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 44, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 48, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 52, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 56, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  0, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  5, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  9, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 13, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 17, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 21, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 25, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 29, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 33, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 37, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 41, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 45, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 49, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 53, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 57, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  1, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  6, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 10, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 14, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 18, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 22, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 26, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 30, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 34, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 38, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 42, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 46, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 50, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 54, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 58, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  2, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  7, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 11, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 15, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 19, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 23, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 27, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 31, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 35, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 39, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 43, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 47, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 51, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 55, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 59, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core:  3, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: cycles simulated: 500  inst.: 49456 (ipc=98.9) sim_rate=49456 (inst/sec) elapsed = 0:0:00:01 / Mon Jun 14 15:47:12 2021
GPGPU-Sim PTX: 100000 instructions simulated : ctaid=(1,0,0) tid=(1,5,0)
GPGPU-Sim PTX: 200000 instructions simulated : ctaid=(0,4,0) tid=(1,1,0)
GPGPU-Sim PTX: 300000 instructions simulated : ctaid=(8,2,0) tid=(5,3,0)
GPGPU-Sim uArch: cycles simulated: 1500  inst.: 305100 (ipc=203.4) sim_rate=152550 (inst/sec) elapsed = 0:0:00:02 / Mon Jun 14 15:47:13 2021
GPGPU-Sim PTX: 400000 instructions simulated : ctaid=(2,2,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 2500  inst.: 473544 (ipc=189.4) sim_rate=157848 (inst/sec) elapsed = 0:0:00:03 / Mon Jun 14 15:47:14 2021
GPGPU-Sim PTX: 500000 instructions simulated : ctaid=(12,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 600000 instructions simulated : ctaid=(3,6,0) tid=(7,4,0)
GPGPU-Sim PTX: 700000 instructions simulated : ctaid=(8,1,0) tid=(3,8,0)
GPGPU-Sim PTX: 800000 instructions simulated : ctaid=(3,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 900000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 3500  inst.: 874228 (ipc=249.8) sim_rate=218557 (inst/sec) elapsed = 0:0:00:04 / Mon Jun 14 15:47:15 2021
GPGPU-Sim PTX: 1000000 instructions simulated : ctaid=(8,6,0) tid=(7,8,0)
GPGPU-Sim PTX: 1100000 instructions simulated : ctaid=(1,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 1200000 instructions simulated : ctaid=(5,0,0) tid=(9,1,0)
GPGPU-Sim PTX: 1300000 instructions simulated : ctaid=(11,0,0) tid=(9,9,0)
GPGPU-Sim uArch: cycles simulated: 4500  inst.: 1279928 (ipc=284.4) sim_rate=255985 (inst/sec) elapsed = 0:0:00:05 / Mon Jun 14 15:47:16 2021
GPGPU-Sim PTX: 1400000 instructions simulated : ctaid=(9,0,0) tid=(3,4,0)
GPGPU-Sim PTX: 1500000 instructions simulated : ctaid=(10,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 1600000 instructions simulated : ctaid=(4,5,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 5500  inst.: 1658768 (ipc=301.6) sim_rate=276461 (inst/sec) elapsed = 0:0:00:06 / Mon Jun 14 15:47:17 2021
GPGPU-Sim PTX: 1700000 instructions simulated : ctaid=(10,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 1800000 instructions simulated : ctaid=(0,3,0) tid=(5,7,0)
GPGPU-Sim uArch: cycles simulated: 6000  inst.: 1825436 (ipc=304.2) sim_rate=260776 (inst/sec) elapsed = 0:0:00:07 / Mon Jun 14 15:47:18 2021
GPGPU-Sim PTX: 1900000 instructions simulated : ctaid=(1,1,0) tid=(7,6,0)
GPGPU-Sim PTX: 2000000 instructions simulated : ctaid=(14,4,0) tid=(9,1,0)
GPGPU-Sim PTX: 2100000 instructions simulated : ctaid=(2,5,0) tid=(1,7,0)
GPGPU-Sim PTX: 2200000 instructions simulated : ctaid=(5,2,0) tid=(9,7,0)
GPGPU-Sim uArch: cycles simulated: 7000  inst.: 2181760 (ipc=311.7) sim_rate=272720 (inst/sec) elapsed = 0:0:00:08 / Mon Jun 14 15:47:19 2021
GPGPU-Sim PTX: 2300000 instructions simulated : ctaid=(1,2,0) tid=(1,9,0)
GPGPU-Sim PTX: 2400000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 2500000 instructions simulated : ctaid=(14,2,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 8000  inst.: 2555524 (ipc=319.4) sim_rate=283947 (inst/sec) elapsed = 0:0:00:09 / Mon Jun 14 15:47:20 2021
GPGPU-Sim PTX: 2600000 instructions simulated : ctaid=(11,4,0) tid=(5,3,0)
GPGPU-Sim PTX: 2700000 instructions simulated : ctaid=(0,1,0) tid=(7,8,0)
GPGPU-Sim PTX: 2800000 instructions simulated : ctaid=(1,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 2900000 instructions simulated : ctaid=(6,6,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 9000  inst.: 2895792 (ipc=321.8) sim_rate=289579 (inst/sec) elapsed = 0:0:00:10 / Mon Jun 14 15:47:21 2021
GPGPU-Sim PTX: 3000000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim PTX: 3100000 instructions simulated : ctaid=(9,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 3200000 instructions simulated : ctaid=(5,1,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 10000  inst.: 3257748 (ipc=325.8) sim_rate=296158 (inst/sec) elapsed = 0:0:00:11 / Mon Jun 14 15:47:22 2021
GPGPU-Sim PTX: 3300000 instructions simulated : ctaid=(12,6,0) tid=(9,5,0)
GPGPU-Sim PTX: 3400000 instructions simulated : ctaid=(0,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 3500000 instructions simulated : ctaid=(14,7,0) tid=(3,8,0)
GPGPU-Sim PTX: 3600000 instructions simulated : ctaid=(8,0,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 11000  inst.: 3600988 (ipc=327.4) sim_rate=300082 (inst/sec) elapsed = 0:0:00:12 / Mon Jun 14 15:47:23 2021
GPGPU-Sim PTX: 3700000 instructions simulated : ctaid=(12,3,0) tid=(5,3,0)
GPGPU-Sim PTX: 3800000 instructions simulated : ctaid=(13,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 3900000 instructions simulated : ctaid=(4,7,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 12000  inst.: 3965588 (ipc=330.5) sim_rate=305045 (inst/sec) elapsed = 0:0:00:13 / Mon Jun 14 15:47:24 2021
GPGPU-Sim PTX: 4000000 instructions simulated : ctaid=(4,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 4100000 instructions simulated : ctaid=(5,6,0) tid=(7,2,0)
GPGPU-Sim PTX: 4200000 instructions simulated : ctaid=(3,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 4300000 instructions simulated : ctaid=(5,2,0) tid=(5,7,0)
GPGPU-Sim uArch: cycles simulated: 13000  inst.: 4315724 (ipc=332.0) sim_rate=308266 (inst/sec) elapsed = 0:0:00:14 / Mon Jun 14 15:47:25 2021
GPGPU-Sim PTX: 4400000 instructions simulated : ctaid=(1,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 4500000 instructions simulated : ctaid=(4,3,0) tid=(7,6,0)
GPGPU-Sim PTX: 4600000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 14000  inst.: 4667380 (ipc=333.4) sim_rate=311158 (inst/sec) elapsed = 0:0:00:15 / Mon Jun 14 15:47:26 2021
GPGPU-Sim PTX: 4700000 instructions simulated : ctaid=(12,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 4800000 instructions simulated : ctaid=(12,7,0) tid=(7,6,0)
GPGPU-Sim PTX: 4900000 instructions simulated : ctaid=(14,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 5000000 instructions simulated : ctaid=(14,2,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 15000  inst.: 5017508 (ipc=334.5) sim_rate=313594 (inst/sec) elapsed = 0:0:00:16 / Mon Jun 14 15:47:27 2021
GPGPU-Sim PTX: 5100000 instructions simulated : ctaid=(7,3,0) tid=(3,8,0)
GPGPU-Sim PTX: 5200000 instructions simulated : ctaid=(14,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 15500  inst.: 5198688 (ipc=335.4) sim_rate=305805 (inst/sec) elapsed = 0:0:00:17 / Mon Jun 14 15:47:28 2021
GPGPU-Sim PTX: 5300000 instructions simulated : ctaid=(9,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 5400000 instructions simulated : ctaid=(3,5,0) tid=(7,6,0)
GPGPU-Sim PTX: 5500000 instructions simulated : ctaid=(4,3,0) tid=(5,3,0)
GPGPU-Sim uArch: cycles simulated: 16500  inst.: 5561548 (ipc=337.1) sim_rate=308974 (inst/sec) elapsed = 0:0:00:18 / Mon Jun 14 15:47:29 2021
GPGPU-Sim PTX: 5600000 instructions simulated : ctaid=(5,1,0) tid=(9,9,0)
GPGPU-Sim PTX: 5700000 instructions simulated : ctaid=(8,0,0) tid=(3,4,0)
GPGPU-Sim PTX: 5800000 instructions simulated : ctaid=(3,1,0) tid=(1,3,0)
GPGPU-Sim PTX: 5900000 instructions simulated : ctaid=(0,7,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 17500  inst.: 5922888 (ipc=338.5) sim_rate=311730 (inst/sec) elapsed = 0:0:00:19 / Mon Jun 14 15:47:30 2021
GPGPU-Sim PTX: 6000000 instructions simulated : ctaid=(3,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 6100000 instructions simulated : ctaid=(2,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 6200000 instructions simulated : ctaid=(2,0,0) tid=(9,3,0)
GPGPU-Sim PTX: 6300000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 18500  inst.: 6276544 (ipc=339.3) sim_rate=313827 (inst/sec) elapsed = 0:0:00:20 / Mon Jun 14 15:47:31 2021
GPGPU-Sim PTX: 6400000 instructions simulated : ctaid=(10,0,0) tid=(1,7,0)
GPGPU-Sim PTX: 6500000 instructions simulated : ctaid=(13,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 6600000 instructions simulated : ctaid=(8,0,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 19500  inst.: 6644036 (ipc=340.7) sim_rate=316382 (inst/sec) elapsed = 0:0:00:21 / Mon Jun 14 15:47:32 2021
GPGPU-Sim PTX: 6700000 instructions simulated : ctaid=(8,6,0) tid=(5,1,0)
GPGPU-Sim PTX: 6800000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim PTX: 6900000 instructions simulated : ctaid=(5,6,0) tid=(9,9,0)
GPGPU-Sim PTX: 7000000 instructions simulated : ctaid=(4,5,0) tid=(5,9,0)
GPGPU-Sim uArch: cycles simulated: 20500  inst.: 7001524 (ipc=341.5) sim_rate=318251 (inst/sec) elapsed = 0:0:00:22 / Mon Jun 14 15:47:33 2021
GPGPU-Sim PTX: 7100000 instructions simulated : ctaid=(11,0,0) tid=(3,2,0)
GPGPU-Sim PTX: 7200000 instructions simulated : ctaid=(12,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 7300000 instructions simulated : ctaid=(12,0,0) tid=(5,1,0)
GPGPU-Sim uArch: cycles simulated: 21500  inst.: 7362756 (ipc=342.5) sim_rate=320119 (inst/sec) elapsed = 0:0:00:23 / Mon Jun 14 15:47:34 2021
GPGPU-Sim PTX: 7400000 instructions simulated : ctaid=(1,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 7500000 instructions simulated : ctaid=(11,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 7600000 instructions simulated : ctaid=(4,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 7700000 instructions simulated : ctaid=(11,7,0) tid=(5,9,0)
GPGPU-Sim uArch: cycles simulated: 22500  inst.: 7729204 (ipc=343.5) sim_rate=322050 (inst/sec) elapsed = 0:0:00:24 / Mon Jun 14 15:47:35 2021
GPGPU-Sim PTX: 7800000 instructions simulated : ctaid=(11,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 7900000 instructions simulated : ctaid=(7,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 8000000 instructions simulated : ctaid=(8,7,0) tid=(1,5,0)
GPGPU-Sim PTX: 8100000 instructions simulated : ctaid=(4,0,0) tid=(3,6,0)
GPGPU-Sim uArch: cycles simulated: 23500  inst.: 8076140 (ipc=343.7) sim_rate=323045 (inst/sec) elapsed = 0:0:00:25 / Mon Jun 14 15:47:36 2021
GPGPU-Sim PTX: 8200000 instructions simulated : ctaid=(6,3,0) tid=(7,0,0)
GPGPU-Sim PTX: 8300000 instructions simulated : ctaid=(12,7,0) tid=(3,6,0)
GPGPU-Sim PTX: 8400000 instructions simulated : ctaid=(5,5,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 24500  inst.: 8433512 (ipc=344.2) sim_rate=324365 (inst/sec) elapsed = 0:0:00:26 / Mon Jun 14 15:47:37 2021
GPGPU-Sim PTX: 8500000 instructions simulated : ctaid=(1,2,0) tid=(1,3,0)
GPGPU-Sim PTX: 8600000 instructions simulated : ctaid=(13,7,0) tid=(3,8,0)
GPGPU-Sim PTX: 8700000 instructions simulated : ctaid=(3,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 8800000 instructions simulated : ctaid=(11,4,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 25500  inst.: 8795428 (ipc=344.9) sim_rate=325756 (inst/sec) elapsed = 0:0:00:27 / Mon Jun 14 15:47:38 2021
GPGPU-Sim PTX: 8900000 instructions simulated : ctaid=(10,7,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 26000  inst.: 8970768 (ipc=345.0) sim_rate=320384 (inst/sec) elapsed = 0:0:00:28 / Mon Jun 14 15:47:39 2021
GPGPU-Sim PTX: 9000000 instructions simulated : ctaid=(9,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 9100000 instructions simulated : ctaid=(10,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 9200000 instructions simulated : ctaid=(4,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 9300000 instructions simulated : ctaid=(8,4,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 27000  inst.: 9350236 (ipc=346.3) sim_rate=322421 (inst/sec) elapsed = 0:0:00:29 / Mon Jun 14 15:47:40 2021
GPGPU-Sim PTX: 9400000 instructions simulated : ctaid=(6,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 9500000 instructions simulated : ctaid=(8,1,0) tid=(5,7,0)
GPGPU-Sim PTX: 9600000 instructions simulated : ctaid=(5,1,0) tid=(9,7,0)
GPGPU-Sim PTX: 9700000 instructions simulated : ctaid=(7,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 28000  inst.: 9687816 (ipc=346.0) sim_rate=322927 (inst/sec) elapsed = 0:0:00:30 / Mon Jun 14 15:47:41 2021
GPGPU-Sim PTX: 9800000 instructions simulated : ctaid=(1,2,0) tid=(1,1,0)
GPGPU-Sim PTX: 9900000 instructions simulated : ctaid=(1,7,0) tid=(7,8,0)
GPGPU-Sim PTX: 10000000 instructions simulated : ctaid=(1,2,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 29000  inst.: 10051864 (ipc=346.6) sim_rate=324253 (inst/sec) elapsed = 0:0:00:31 / Mon Jun 14 15:47:42 2021
GPGPU-Sim PTX: 10100000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 10200000 instructions simulated : ctaid=(3,6,0) tid=(9,1,0)
GPGPU-Sim PTX: 10300000 instructions simulated : ctaid=(2,1,0) tid=(3,4,0)
GPGPU-Sim PTX: 10400000 instructions simulated : ctaid=(11,2,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 30000  inst.: 10408808 (ipc=347.0) sim_rate=325275 (inst/sec) elapsed = 0:0:00:32 / Mon Jun 14 15:47:43 2021
GPGPU-Sim PTX: 10500000 instructions simulated : ctaid=(6,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 10600000 instructions simulated : ctaid=(12,4,0) tid=(3,0,0)
GPGPU-Sim PTX: 10700000 instructions simulated : ctaid=(0,4,0) tid=(9,7,0)
GPGPU-Sim uArch: cycles simulated: 31000  inst.: 10769580 (ipc=347.4) sim_rate=326350 (inst/sec) elapsed = 0:0:00:33 / Mon Jun 14 15:47:44 2021
GPGPU-Sim PTX: 10800000 instructions simulated : ctaid=(14,4,0) tid=(7,6,0)
GPGPU-Sim PTX: 10900000 instructions simulated : ctaid=(7,6,0) tid=(9,1,0)
GPGPU-Sim PTX: 11000000 instructions simulated : ctaid=(6,3,0) tid=(5,5,0)
GPGPU-Sim PTX: 11100000 instructions simulated : ctaid=(13,2,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 32000  inst.: 11110308 (ipc=347.2) sim_rate=326773 (inst/sec) elapsed = 0:0:00:34 / Mon Jun 14 15:47:45 2021
GPGPU-Sim PTX: 11200000 instructions simulated : ctaid=(6,4,0) tid=(7,0,0)
GPGPU-Sim PTX: 11300000 instructions simulated : ctaid=(3,6,0) tid=(9,9,0)
GPGPU-Sim PTX: 11400000 instructions simulated : ctaid=(13,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 11500000 instructions simulated : ctaid=(4,5,0) tid=(5,5,0)
GPGPU-Sim uArch: cycles simulated: 33000  inst.: 11488720 (ipc=348.1) sim_rate=328249 (inst/sec) elapsed = 0:0:00:35 / Mon Jun 14 15:47:46 2021
GPGPU-Sim PTX: 11600000 instructions simulated : ctaid=(12,2,0) tid=(1,1,0)
GPGPU-Sim PTX: 11700000 instructions simulated : ctaid=(12,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 11800000 instructions simulated : ctaid=(7,1,0) tid=(7,0,0)
GPGPU-Sim uArch: cycles simulated: 34000  inst.: 11823760 (ipc=347.8) sim_rate=328437 (inst/sec) elapsed = 0:0:00:36 / Mon Jun 14 15:47:47 2021
GPGPU-Sim PTX: 11900000 instructions simulated : ctaid=(2,2,0) tid=(5,5,0)
GPGPU-Sim PTX: 12000000 instructions simulated : ctaid=(8,6,0) tid=(9,1,0)
GPGPU-Sim PTX: 12100000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 12200000 instructions simulated : ctaid=(11,4,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 35000  inst.: 12216676 (ipc=349.0) sim_rate=330180 (inst/sec) elapsed = 0:0:00:37 / Mon Jun 14 15:47:48 2021
GPGPU-Sim PTX: 12300000 instructions simulated : ctaid=(2,5,0) tid=(1,5,0)
GPGPU-Sim PTX: 12400000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 12500000 instructions simulated : ctaid=(2,5,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 36000  inst.: 12558896 (ipc=348.9) sim_rate=330497 (inst/sec) elapsed = 0:0:00:38 / Mon Jun 14 15:47:49 2021
GPGPU-Sim PTX: 12600000 instructions simulated : ctaid=(0,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 12700000 instructions simulated : ctaid=(6,3,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 36500  inst.: 12743812 (ipc=349.1) sim_rate=326764 (inst/sec) elapsed = 0:0:00:39 / Mon Jun 14 15:47:50 2021
GPGPU-Sim PTX: 12800000 instructions simulated : ctaid=(5,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 12900000 instructions simulated : ctaid=(12,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 13000000 instructions simulated : ctaid=(14,2,0) tid=(1,5,0)
GPGPU-Sim PTX: 13100000 instructions simulated : ctaid=(11,5,0) tid=(1,3,0)
GPGPU-Sim uArch: cycles simulated: 37500  inst.: 13091280 (ipc=349.1) sim_rate=327282 (inst/sec) elapsed = 0:0:00:40 / Mon Jun 14 15:47:51 2021
GPGPU-Sim PTX: 13200000 instructions simulated : ctaid=(8,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 13300000 instructions simulated : ctaid=(1,6,0) tid=(7,0,0)
GPGPU-Sim PTX: 13400000 instructions simulated : ctaid=(1,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 38500  inst.: 13449032 (ipc=349.3) sim_rate=328025 (inst/sec) elapsed = 0:0:00:41 / Mon Jun 14 15:47:52 2021
GPGPU-Sim PTX: 13500000 instructions simulated : ctaid=(3,0,0) tid=(1,3,0)
GPGPU-Sim PTX: 13600000 instructions simulated : ctaid=(7,5,0) tid=(3,4,0)
GPGPU-Sim PTX: 13700000 instructions simulated : ctaid=(3,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 13800000 instructions simulated : ctaid=(2,1,0) tid=(1,3,0)
GPGPU-Sim uArch: cycles simulated: 39500  inst.: 13803876 (ipc=349.5) sim_rate=328663 (inst/sec) elapsed = 0:0:00:42 / Mon Jun 14 15:47:53 2021
GPGPU-Sim PTX: 13900000 instructions simulated : ctaid=(1,2,0) tid=(3,6,0)
GPGPU-Sim PTX: 14000000 instructions simulated : ctaid=(1,4,0) tid=(9,1,0)
GPGPU-Sim PTX: 14100000 instructions simulated : ctaid=(0,2,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 40500  inst.: 14167076 (ipc=349.8) sim_rate=329466 (inst/sec) elapsed = 0:0:00:43 / Mon Jun 14 15:47:54 2021
GPGPU-Sim PTX: 14200000 instructions simulated : ctaid=(9,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 14300000 instructions simulated : ctaid=(4,6,0) tid=(3,8,0)
GPGPU-Sim PTX: 14400000 instructions simulated : ctaid=(4,5,0) tid=(9,7,0)
GPGPU-Sim PTX: 14500000 instructions simulated : ctaid=(8,6,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 41500  inst.: 14512928 (ipc=349.7) sim_rate=329839 (inst/sec) elapsed = 0:0:00:44 / Mon Jun 14 15:47:55 2021
GPGPU-Sim PTX: 14600000 instructions simulated : ctaid=(4,3,0) tid=(3,2,0)
GPGPU-Sim PTX: 14700000 instructions simulated : ctaid=(10,5,0) tid=(7,4,0)
GPGPU-Sim PTX: 14800000 instructions simulated : ctaid=(11,6,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 42500  inst.: 14871404 (ipc=349.9) sim_rate=330475 (inst/sec) elapsed = 0:0:00:45 / Mon Jun 14 15:47:56 2021
GPGPU-Sim PTX: 14900000 instructions simulated : ctaid=(7,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 15000000 instructions simulated : ctaid=(14,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 15100000 instructions simulated : ctaid=(9,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 15200000 instructions simulated : ctaid=(3,2,0) tid=(5,5,0)
GPGPU-Sim uArch: cycles simulated: 43500  inst.: 15233248 (ipc=350.2) sim_rate=331157 (inst/sec) elapsed = 0:0:00:46 / Mon Jun 14 15:47:57 2021
GPGPU-Sim PTX: 15300000 instructions simulated : ctaid=(14,1,0) tid=(5,5,0)
GPGPU-Sim PTX: 15400000 instructions simulated : ctaid=(0,2,0) tid=(7,4,0)
GPGPU-Sim PTX: 15500000 instructions simulated : ctaid=(0,0,0) tid=(7,8,0)
GPGPU-Sim PTX: 15600000 instructions simulated : ctaid=(1,1,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 44500  inst.: 15586264 (ipc=350.3) sim_rate=331622 (inst/sec) elapsed = 0:0:00:47 / Mon Jun 14 15:47:58 2021
GPGPU-Sim PTX: 15700000 instructions simulated : ctaid=(6,3,0) tid=(3,6,0)
GPGPU-Sim PTX: 15800000 instructions simulated : ctaid=(4,7,0) tid=(3,4,0)
GPGPU-Sim PTX: 15900000 instructions simulated : ctaid=(0,2,0) tid=(9,7,0)
GPGPU-Sim uArch: cycles simulated: 45500  inst.: 15949040 (ipc=350.5) sim_rate=332271 (inst/sec) elapsed = 0:0:00:48 / Mon Jun 14 15:47:59 2021
GPGPU-Sim PTX: 16000000 instructions simulated : ctaid=(9,5,0) tid=(9,1,0)
GPGPU-Sim PTX: 16100000 instructions simulated : ctaid=(11,5,0) tid=(1,7,0)
GPGPU-Sim PTX: 16200000 instructions simulated : ctaid=(6,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 16300000 instructions simulated : ctaid=(4,4,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 46500  inst.: 16323188 (ipc=351.0) sim_rate=333126 (inst/sec) elapsed = 0:0:00:49 / Mon Jun 14 15:48:00 2021
GPGPU-Sim PTX: 16400000 instructions simulated : ctaid=(5,1,0) tid=(3,2,0)
GPGPU-Sim PTX: 16500000 instructions simulated : ctaid=(4,6,0) tid=(9,5,0)
GPGPU-Sim PTX: 16600000 instructions simulated : ctaid=(10,2,0) tid=(5,1,0)
GPGPU-Sim uArch: Shader 16 finished CTA #0 (47265,0), 1 CTAs running
GPGPU-Sim uArch: Shader 29 finished CTA #0 (47414,0), 1 CTAs running
GPGPU-Sim uArch: Shader 5 finished CTA #0 (47423,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #0 (47436,0), 1 CTAs running
GPGPU-Sim uArch: Shader 51 finished CTA #0 (47497,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #0 (47499,0), 1 CTAs running
GPGPU-Sim uArch: cycles simulated: 47500  inst.: 16627484 (ipc=350.1) sim_rate=332549 (inst/sec) elapsed = 0:0:00:50 / Mon Jun 14 15:48:01 2021
GPGPU-Sim uArch: Shader 14 finished CTA #0 (47501,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #0 (47559,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #1 (47584,0), 0 CTAs running
GPGPU-Sim uArch: Shader 14 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 39 finished CTA #0 (47615,0), 1 CTAs running
GPGPU-Sim uArch: Shader 16 finished CTA #1 (47620,0), 0 CTAs running
GPGPU-Sim uArch: Shader 16 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #0 (47632,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #0 (47633,0), 1 CTAs running
GPGPU-Sim uArch: Shader 7 finished CTA #0 (47636,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #1 (47654,0), 0 CTAs running
GPGPU-Sim uArch: Shader 17 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 10 finished CTA #0 (47661,0), 1 CTAs running
GPGPU-Sim uArch: Shader 34 finished CTA #1 (47663,0), 1 CTAs running
GPGPU-Sim uArch: Shader 53 finished CTA #0 (47671,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #0 (47690,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #1 (47725,0), 0 CTAs running
GPGPU-Sim uArch: Shader 18 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 27 finished CTA #0 (47732,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #0 (47734,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #0 (47754,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #0 (47758,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #0 (47760,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #1 (47767,0), 0 CTAs running
GPGPU-Sim uArch: Shader 12 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 5 finished CTA #1 (47768,0), 0 CTAs running
GPGPU-Sim uArch: Shader 5 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 32 finished CTA #0 (47776,0), 1 CTAs running
GPGPU-Sim uArch: Shader 27 finished CTA #1 (47790,0), 0 CTAs running
GPGPU-Sim uArch: Shader 27 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #0 (47794,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #0 (47796,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #1 (47798,0), 0 CTAs running
GPGPU-Sim uArch: Shader 8 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #0 (47811,0), 1 CTAs running
GPGPU-Sim uArch: Shader 25 finished CTA #0 (47812,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #1 (47813,0), 0 CTAs running
GPGPU-Sim uArch: Shader 3 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 13 finished CTA #1 (47818,0), 1 CTAs running
GPGPU-Sim uArch: Shader 13 finished CTA #0 (47838,0), 0 CTAs running
GPGPU-Sim uArch: Shader 13 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 44 finished CTA #0 (47844,0), 1 CTAs running
GPGPU-Sim uArch: Shader 32 finished CTA #1 (47848,0), 0 CTAs running
GPGPU-Sim uArch: Shader 32 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 50 finished CTA #1 (47862,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #1 (47863,0), 0 CTAs running
GPGPU-Sim uArch: Shader 23 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 53 finished CTA #1 (47865,0), 0 CTAs running
GPGPU-Sim uArch: Shader 53 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 7 finished CTA #1 (47867,0), 0 CTAs running
GPGPU-Sim uArch: Shader 7 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #1 (47875,0), 1 CTAs running
GPGPU-Sim uArch: Shader 52 finished CTA #1 (47884,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 19 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 40 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 6 finished CTA #0 (47888,0), 1 CTAs running
GPGPU-Sim uArch: Shader 57 finished CTA #0 (47892,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #1 (47904,0), 0 CTAs running
GPGPU-Sim uArch: Shader 55 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 52 finished CTA #0 (47906,0), 0 CTAs running
GPGPU-Sim uArch: Shader 52 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #0 (47909,0), 0 CTAs running
GPGPU-Sim uArch: Shader 20 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 0 finished CTA #0 (47914,0), 1 CTAs running
GPGPU-Sim uArch: Shader 10 finished CTA #1 (47915,0), 0 CTAs running
GPGPU-Sim uArch: Shader 10 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 29 finished CTA #1 (47919,0), 0 CTAs running
GPGPU-Sim uArch: Shader 29 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 11 finished CTA #0 (47938,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #1 (47963,0), 0 CTAs running
GPGPU-Sim uArch: Shader 45 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #1 (47964,0), 1 CTAs running
GPGPU-Sim uArch: Shader 11 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 11 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 57 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 57 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #0 (47967,0), 1 CTAs running
GPGPU-Sim uArch: Shader 43 finished CTA #0 (47975,0), 1 CTAs running
GPGPU-Sim uArch: Shader 47 finished CTA #0 (47978,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #0 (47985,0), 1 CTAs running
GPGPU-Sim uArch: Shader 36 finished CTA #0 (47987,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #1 (47988,0), 0 CTAs running
GPGPU-Sim uArch: Shader 4 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #1 (47991,0), 0 CTAs running
GPGPU-Sim uArch: Shader 9 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #0 (47996,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #1 (48000,0), 0 CTAs running
GPGPU-Sim uArch: Shader 21 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 51 finished CTA #1 (48001,0), 0 CTAs running
GPGPU-Sim uArch: Shader 51 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #0 (48008,0), 0 CTAs running
GPGPU-Sim uArch: Shader 15 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #0 (48010,0), 1 CTAs running
GPGPU-Sim uArch: Shader 30 finished CTA #0 (48018,0), 1 CTAs running
GPGPU-Sim uArch: Shader 0 finished CTA #1 (48019,0), 0 CTAs running
GPGPU-Sim uArch: Shader 0 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 54 finished CTA #0 (48026,0), 1 CTAs running
GPGPU-Sim uArch: Shader 6 finished CTA #1 (48028,0), 0 CTAs running
GPGPU-Sim uArch: Shader 6 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #0 (48031,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #0 (48042,0), 1 CTAs running
GPGPU-Sim uArch: Shader 54 finished CTA #1 (48056,0), 0 CTAs running
GPGPU-Sim uArch: Shader 54 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #1 (48059,0), 0 CTAs running
GPGPU-Sim uArch: Shader 24 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #1 (48061,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #1 (48063,0), 0 CTAs running
GPGPU-Sim uArch: Shader 48 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 30 finished CTA #1 (48070,0), 0 CTAs running
GPGPU-Sim uArch: Shader 30 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 25 finished CTA #1 (48079,0), 0 CTAs running
GPGPU-Sim uArch: Shader 25 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #1 (48081,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #1 (48083,0), 0 CTAs running
GPGPU-Sim uArch: Shader 59 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #0 (48084,0), 1 CTAs running
GPGPU-Sim uArch: Shader 49 finished CTA #1 (48086,0), 1 CTAs running
GPGPU-Sim uArch: Shader 39 finished CTA #1 (48096,0), 0 CTAs running
GPGPU-Sim uArch: Shader 39 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #0 (48098,0), 0 CTAs running
GPGPU-Sim uArch: Shader 2 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 49 finished CTA #0 (48102,0), 0 CTAs running
GPGPU-Sim uArch: Shader 49 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 28 finished CTA #0 (48104,0), 1 CTAs running
GPGPU-Sim uArch: Shader 28 finished CTA #1 (48107,0), 0 CTAs running
GPGPU-Sim uArch: Shader 28 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #0 (48111,0), 1 CTAs running
GPGPU-Sim uArch: Shader 50 finished CTA #0 (48111,0), 0 CTAs running
GPGPU-Sim uArch: Shader 50 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #1 (48114,0), 0 CTAs running
GPGPU-Sim uArch: Shader 22 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #1 (48132,0), 0 CTAs running
GPGPU-Sim uArch: Shader 58 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 47 finished CTA #1 (48143,0), 0 CTAs running
GPGPU-Sim uArch: Shader 47 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #0 (48153,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #1 (48153,0), 0 CTAs running
GPGPU-Sim uArch: Shader 35 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 36 finished CTA #1 (48154,0), 0 CTAs running
GPGPU-Sim uArch: Shader 36 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #1 (48160,0), 0 CTAs running
GPGPU-Sim uArch: Shader 31 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 26 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 33 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #0 (48165,0), 0 CTAs running
GPGPU-Sim uArch: Shader 38 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 34 finished CTA #0 (48177,0), 0 CTAs running
GPGPU-Sim uArch: Shader 34 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #1 (48188,0), 0 CTAs running
GPGPU-Sim uArch: Shader 56 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #1 (48189,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #0 (48192,0), 1 CTAs running
GPGPU-Sim uArch: Shader 37 finished CTA #1 (48202,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #1 (48212,0), 0 CTAs running
GPGPU-Sim uArch: Shader 1 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #0 (48216,0), 0 CTAs running
GPGPU-Sim uArch: Shader 41 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 37 finished CTA #0 (48218,0), 0 CTAs running
GPGPU-Sim uArch: Shader 37 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #1 (48221,0), 0 CTAs running
GPGPU-Sim uArch: Shader 46 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #0 (48226,0), 1 CTAs running
GPGPU-Sim uArch: Shader 44 finished CTA #1 (48233,0), 0 CTAs running
GPGPU-Sim uArch: Shader 44 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #1 (48248,0), 0 CTAs running
GPGPU-Sim uArch: Shader 42 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 43 finished CTA #1 (48281,0), 0 CTAs running
GPGPU-Sim uArch: Shader 43 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: GPU detected kernel '_Z14matrix_mul_gpuPiS_S_i' finished on shader 43.
kernel_name = _Z14matrix_mul_gpuPiS_S_i 
kernel_launch_uid = 1 
gpu_sim_cycle = 48282
gpu_sim_insn = 16632000
gpu_ipc =     344.4762
gpu_tot_sim_cycle = 48282
gpu_tot_sim_insn = 16632000
gpu_tot_ipc =     344.4762
gpu_tot_issued_cta = 0
gpu_stall_dramfull = 42547
gpu_stall_icnt2sh    = 68778
gpu_total_sim_rate=332640

========= Core cache stats =========
L1I_cache:
	L1I_total_cache_accesses = 371520
	L1I_total_cache_misses = 1920
	L1I_total_cache_miss_rate = 0.0052
	L1I_total_cache_pending_hits = 0
	L1I_total_cache_reservation_fails = 0
L1D_cache:
	L1D_cache_core[0]: Access = 21513, Miss = 1284, Miss_rate = 0.060, Pending_hits = 5147, Reservation_fails = 2614
	L1D_cache_core[1]: Access = 21489, Miss = 1278, Miss_rate = 0.059, Pending_hits = 5140, Reservation_fails = 1796
	L1D_cache_core[2]: Access = 21492, Miss = 1281, Miss_rate = 0.060, Pending_hits = 5151, Reservation_fails = 2084
	L1D_cache_core[3]: Access = 21457, Miss = 1268, Miss_rate = 0.059, Pending_hits = 5125, Reservation_fails = 1525
	L1D_cache_core[4]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5144, Reservation_fails = 928
	L1D_cache_core[5]: Access = 21481, Miss = 1272, Miss_rate = 0.059, Pending_hits = 5142, Reservation_fails = 2045
	L1D_cache_core[6]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5143, Reservation_fails = 2898
	L1D_cache_core[7]: Access = 21505, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5158, Reservation_fails = 3324
	L1D_cache_core[8]: Access = 21508, Miss = 1279, Miss_rate = 0.059, Pending_hits = 5149, Reservation_fails = 2750
	L1D_cache_core[9]: Access = 21505, Miss = 1287, Miss_rate = 0.060, Pending_hits = 5157, Reservation_fails = 3313
	L1D_cache_core[10]: Access = 21508, Miss = 1293, Miss_rate = 0.060, Pending_hits = 5164, Reservation_fails = 3276
	L1D_cache_core[11]: Access = 21505, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5148, Reservation_fails = 2931
	L1D_cache_core[12]: Access = 21508, Miss = 1292, Miss_rate = 0.060, Pending_hits = 5150, Reservation_fails = 2987
	L1D_cache_core[13]: Access = 21513, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5145, Reservation_fails = 2983
	L1D_cache_core[14]: Access = 21516, Miss = 1285, Miss_rate = 0.060, Pending_hits = 5136, Reservation_fails = 3154
	L1D_total_cache_accesses = 322468
	L1D_total_cache_misses = 19228
	L1D_total_cache_miss_rate = 0.0596
	L1D_total_cache_pending_hits = 77199
	L1D_total_cache_reservation_fails = 38608
	L1D_cache_data_port_util = 0.078
	L1D_cache_fill_port_util = 0.006
L1C_cache:
	L1C_total_cache_accesses = 1920
	L1C_total_cache_misses = 480
	L1C_total_cache_miss_rate = 0.2500
	L1C_total_cache_pending_hits = 0
	L1C_total_cache_reservation_fails = 0
L1T_cache:
	L1T_total_cache_accesses = 0
	L1T_total_cache_misses = 0
	L1T_total_cache_pending_hits = 0
	L1T_total_cache_reservation_fails = 0

Total_core_cache_stats:
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 225461
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 77199
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 17972
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 13408
	Total_core_cache_stats_breakdown[CONST_ACC_R][HIT] = 1440
	Total_core_cache_stats_breakdown[CONST_ACC_R][MISS] = 480
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 580
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 1256
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][RESERVATION_FAIL] = 25200
	Total_core_cache_stats_breakdown[INST_ACC_R][HIT] = 369600
	Total_core_cache_stats_breakdown[INST_ACC_R][MISS] = 1920
Shader 0 warp_id issue ditsribution:
warp_id:
0, 1, 2, 3, 4, 5, 6, 7, 
distro:
1388, 1388, 1388, 1388, 1388, 1388, 1388, 1388, 
gpgpu_n_tot_thrd_icount = 21319680
gpgpu_n_tot_w_icount = 666240
gpgpu_n_stall_shd_mem = 216596
gpgpu_n_mem_read_local = 0
gpgpu_n_mem_write_local = 0
gpgpu_n_mem_read_global = 17972
gpgpu_n_mem_write_global = 1836
gpgpu_n_mem_texture = 0
gpgpu_n_mem_const = 60
gpgpu_n_load_insn  = 3600000
gpgpu_n_store_insn = 12000
gpgpu_n_shmem_insn = 0
gpgpu_n_tex_insn = 0
gpgpu_n_const_mem_insn = 0
gpgpu_n_param_mem_insn = 48000
gpgpu_n_shmem_bkconflict = 0
gpgpu_n_cache_bkconflict = 0
gpgpu_n_intrawarp_mshr_merge = 0
gpgpu_n_cmem_portconflict = 0
gpgpu_stall_shd_mem[c_mem][bk_conf] = 0
gpgpu_stall_shd_mem[c_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[c_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[c_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[t_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[t_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[t_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[s_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][coal_stall] = 216596
gpgpu_stall_shd_mem[gl_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[g_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[g_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpu_reg_bank_conflict_stalls = 0
Warp Occupancy Distribution:
Stall:169500	W0_Idle:293067	W0_Scoreboard:4645929	W1:0	W2:0	W3:0	W4:166560	W5:0	W6:0	W7:0	W8:0	W9:0	W10:0	W11:0	W12:0	W13:0	W14:0	W15:0	W16:0	W17:0	W18:0	W19:0	W20:0	W21:0	W22:0	W23:0	W24:0	W25:0	W26:0	W27:0	W28:0	W29:0	W30:0	W31:0	W32:499680
traffic_breakdown_coretomem[CONST_ACC_R] = 480 {8:60,}
traffic_breakdown_coretomem[GLOBAL_ACC_R] = 143776 {8:17972,}
traffic_breakdown_coretomem[GLOBAL_ACC_W] = 117600 {40:1008,72:552,136:276,}
traffic_breakdown_coretomem[INST_ACC_R] = 1920 {8:240,}
traffic_breakdown_memtocore[CONST_ACC_R] = 4320 {72:60,}
traffic_breakdown_memtocore[GLOBAL_ACC_R] = 2444192 {136:17972,}
traffic_breakdown_memtocore[GLOBAL_ACC_W] = 14688 {8:1836,}
traffic_breakdown_memtocore[INST_ACC_R] = 32640 {136:240,}
maxmrqlatency = 12 
maxdqlatency = 0 
maxmflatency = 1356 
averagemflatency = 264 
max_icnt2mem_latency = 1205 
max_icnt2sh_latency = 48281 
mrq_lat_table:1080 	32 	4 	10 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
dq_lat_table:0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_table:0 	0 	0 	0 	0 	0 	0 	10720 	8178 	899 	71 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2mem_lat_table:0 	0 	0 	15440 	432 	764 	1403 	1061 	742 	253 	13 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2sh_lat_table:0 	0 	0 	3446 	13655 	892 	39 	0 	0 	0 	0 	0 	0 	0 	0 	1836 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_pw_table:0 	0 	0 	0 	0 	0 	0 	87 	6 	2 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
maximum concurrent accesses to same row:
dram[0]:         1         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
maximum service time to same row:
dram[0]:       502         0         0         0      1650      1707      1653      1501     12085     12831     36822     37578         0         0         0         0 
dram[1]:      1485         0         0         0      1763      1494      1765      1521     12204     13041     36903     37765         0         0         0         0 
dram[2]:         0         0         0         0      1503      1510      1526      1516     12325     13141     37102     37849         0         0         0         0 
dram[3]:         0         0         0         0      1499      1588      1525      1678     12515     13263     37144     38024         0         0         0         0 
dram[4]:         0         0         0         0      1616      1493      1484      1515     12585     13452     37341     38075         0         0         0         0 
dram[5]:         0         0         0         0      1497      1606      1519      1688     12779     13531     37456     38221         0         0         0         0 
average row accesses per activate:
dram[0]:  1.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 21.000000 19.000000      -nan      -nan      -nan      -nan 
dram[1]:  2.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 19.000000 16.000000      -nan      -nan      -nan      -nan 
dram[2]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 18.000000      -nan      -nan      -nan      -nan 
dram[3]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 17.000000      -nan      -nan      -nan      -nan 
dram[4]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 18.000000 21.000000      -nan      -nan      -nan      -nan 
dram[5]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 22.000000 16.000000      -nan      -nan      -nan      -nan 
average row locality = 1126/52 = 21.653847
number of total memory accesses made:
dram[0]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
total accesses: 0
min_bank_accesses = 0!
min_chip_accesses = 0!
number of total read accesses:
dram[0]:         3         0         0         0        10        10        32        32        32        32        16        16         0         0         0         0 
dram[1]:         2         0         0         0        10        10        32        32        32        32        16        15         0         0         0         0 
dram[2]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[3]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[4]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[5]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
total reads: 1084
min_bank_accesses = 0!
chip skew: 183/180 = 1.02
number of total write accesses:
dram[0]:         0         0         0         0         0         0         0         0         0         0         5         3         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         3         1         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         3         4         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         3         3         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         2         7         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         6         2         0         0         0         0 
total reads: 42
min_bank_accesses = 0!
chip skew: 9/4 = 2.25
average mf latency per bank:
dram[0]:       8505    none      none      none        6219      4879      4705      5583      4863      4800      4214      4411    none      none      none      none  
dram[1]:          0    none      none      none        4030      5132      5447      4339      4812      4857      4258      5293    none      none      none      none  
dram[2]:     none      none      none      none        4973      3789      4482      4576      4719      4883      4095      3891    none      none      none      none  
dram[3]:     none      none      none      none        3574      4782      4611      4852      4883      5030      4219      3979    none      none      none      none  
dram[4]:     none      none      none      none        6249      4014      5355      4358      4534      4853      4952      3543    none      none      none      none  
dram[5]:     none      none      none      none        4108      5204      4843      5156      4601      4623      3826      4504    none      none      none      none  
maximum mf latency per bank:
dram[0]:        486         0         0         0      1110      1069      1107      1325       592       781       876       721         0         0         0         0
dram[1]:          0         0         0         0      1097       809      1356       876       801       646       697       706         0         0         0         0
dram[2]:          0         0         0         0       882       679       907      1071       716       771       873       759         0         0         0         0
dram[3]:          0         0         0         0       377       983      1018      1068       742       781       690       817         0         0         0         0
dram[4]:          0         0         0         0      1081       527      1180       723       799       631       718       700         0         0         0         0
dram[5]:          0         0         0         0       543      1153      1006      1168       728       775       873       742         0         0         0         0

Number of Memory Banks Accessed per Memory Operation per Warp (from 0):
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
Average # of Memory Banks Accessed per Memory Operation per Warp=-nan

position of mrq chosen
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	

average position of mrq chosen = -nan
Memory Partition 0: 
Cache L2_bank_000:
MSHR contents

Cache L2_bank_001:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[0]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63344 n_act=11 n_pre=2 n_req=191 n_rd=366 n_write=8 bw_util=0.01174
n_activity=2903 dram_eff=0.2577
bk0: 6a 63659i bk1: 0a 63730i bk2: 0a 63734i bk3: 0a 63734i bk4: 20a 63671i bk5: 20a 63680i bk6: 64a 63581i bk7: 64a 63588i bk8: 64a 63586i bk9: 64a 63580i bk10: 32a 63625i bk11: 32a 63618i bk12: 0a 63727i bk13: 0a 63727i bk14: 0a 63730i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000784548
Memory Partition 1: 
Cache L2_bank_002:
MSHR contents

Cache L2_bank_003:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[1]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=9 n_pre=0 n_req=185 n_rd=362 n_write=5 bw_util=0.01152
n_activity=2703 dram_eff=0.2716
bk0: 4a 63711i bk1: 0a 63732i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 20a 63676i bk6: 64a 63580i bk7: 64a 63580i bk8: 64a 63587i bk9: 64a 63581i bk10: 32a 63624i bk11: 30a 63641i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63732i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000266746
Memory Partition 2: 
Cache L2_bank_004:
MSHR contents

Cache L2_bank_005:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[2]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=187 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2765 dram_eff=0.2662
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63679i bk5: 24a 63671i bk6: 64a 63581i bk7: 64a 63590i bk8: 64a 63588i bk9: 64a 63587i bk10: 32a 63623i bk11: 28a 63638i bk12: 0a 63728i bk13: 0a 63730i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000235364
Memory Partition 3: 
Cache L2_bank_006:
MSHR contents

Cache L2_bank_007:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[3]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63357 n_act=8 n_pre=0 n_req=186 n_rd=360 n_write=6 bw_util=0.01149
n_activity=2830 dram_eff=0.2587
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63680i bk5: 24a 63670i bk6: 64a 63580i bk7: 64a 63572i bk8: 64a 63586i bk9: 64a 63589i bk10: 32a 63621i bk11: 28a 63638i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000580565
Memory Partition 4: 
Cache L2_bank_008:
MSHR contents

Cache L2_bank_009:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[4]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63353 n_act=8 n_pre=0 n_req=189 n_rd=360 n_write=10 bw_util=0.01161
n_activity=2771 dram_eff=0.2671
bk0: 0a 63731i bk1: 0a 63733i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 24a 63670i bk6: 64a 63587i bk7: 64a 63587i bk8: 64a 63588i bk9: 64a 63581i bk10: 32a 63633i bk11: 28a 63627i bk12: 0a 63728i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000313819
Memory Partition 5: 
Cache L2_bank_010:
MSHR contents

Cache L2_bank_011:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[5]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=188 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2789 dram_eff=0.2639
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63733i bk4: 20a 63678i bk5: 24a 63668i bk6: 64a 63584i bk7: 64a 63582i bk8: 64a 63589i bk9: 64a 63588i bk10: 32a 63598i bk11: 28a 63623i bk12: 0a 63729i bk13: 0a 63730i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.00105129

========= L2 cache stats =========
L2_cache_bank[0]: Access = 1826, Miss = 93, Miss_rate = 0.051, Pending_hits = 248, Reservation_fails = 4441
L2_cache_bank[1]: Access = 1684, Miss = 90, Miss_rate = 0.053, Pending_hits = 231, Reservation_fails = 3387
L2_cache_bank[2]: Access = 1777, Miss = 92, Miss_rate = 0.052, Pending_hits = 239, Reservation_fails = 3478
L2_cache_bank[3]: Access = 1652, Miss = 89, Miss_rate = 0.054, Pending_hits = 227, Reservation_fails = 3558
L2_cache_bank[4]: Access = 1642, Miss = 90, Miss_rate = 0.055, Pending_hits = 236, Reservation_fails = 3430
L2_cache_bank[5]: Access = 1661, Miss = 90, Miss_rate = 0.054, Pending_hits = 232, Reservation_fails = 3472
L2_cache_bank[6]: Access = 1637, Miss = 90, Miss_rate = 0.055, Pending_hits = 237, Reservation_fails = 3884
L2_cache_bank[7]: Access = 1639, Miss = 90, Miss_rate = 0.055, Pending_hits = 250, Reservation_fails = 4069
L2_cache_bank[8]: Access = 1656, Miss = 90, Miss_rate = 0.054, Pending_hits = 250, Reservation_fails = 3783
L2_cache_bank[9]: Access = 1643, Miss = 90, Miss_rate = 0.055, Pending_hits = 241, Reservation_fails = 3742
L2_cache_bank[10]: Access = 1641, Miss = 90, Miss_rate = 0.055, Pending_hits = 239, Reservation_fails = 3801
L2_cache_bank[11]: Access = 1650, Miss = 90, Miss_rate = 0.055, Pending_hits = 243, Reservation_fails = 3849
L2_total_cache_accesses = 20108
L2_total_cache_misses = 1084
L2_total_cache_miss_rate = 0.0539
L2_total_cache_pending_hits = 2873
L2_total_cache_reservation_fails = 44894
L2_total_cache_breakdown:
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 14077
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 2839
	L2_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 1056
	L2_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 44379
	L2_cache_stats_breakdown[CONST_ACC_R][HIT] = 56
	L2_cache_stats_breakdown[CONST_ACC_R][HIT_RESERVED] = 3
	L2_cache_stats_breakdown[CONST_ACC_R][MISS] = 1
	L2_cache_stats_breakdown[CONST_ACC_R][RESERVATION_FAIL] = 129
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 1794
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT_RESERVED] = 19
	L2_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 23
	L2_cache_stats_breakdown[INST_ACC_R][HIT] = 224
	L2_cache_stats_breakdown[INST_ACC_R][HIT_RESERVED] = 12
	L2_cache_stats_breakdown[INST_ACC_R][MISS] = 4
	L2_cache_stats_breakdown[INST_ACC_R][RESERVATION_FAIL] = 386
L2_cache_data_port_util = 0.104
L2_cache_fill_port_util = 0.007

icnt_total_pkts_mem_to_simt=93076
icnt_total_pkts_simt_to_mem=23324
LD_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ST_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
----------------------------Interconnect-DETAILS--------------------------------
Class 0:
Packet latency average = 18.5053
	minimum = 6
	maximum = 729
Network latency average = 13.7655
	minimum = 6
	maximum = 426
Slowest packet = 623
Flit latency average = 11.0758
	minimum = 6
	maximum = 426
Slowest flit = 1683
Fragmentation average = 0
	minimum = 0
	maximum = 0
Injected packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Accepted packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Injected flit rate average = 0.0892902
	minimum = 0.0318131 (at node 3)
	maximum = 0.173936 (at node 15)
Accepted flit rate average= 0.0892902
	minimum = 0.0392072 (at node 24)
	maximum = 0.129406 (at node 9)
Injected packet length average = 2.89437
Accepted packet length average = 2.89437
Total in-flight flits = 0 (0 measured)
====== Overall Traffic Statistics ======
====== Traffic class 0 ======
Packet latency average = 18.5053 (1 samples)
	minimum = 6 (1 samples)
	maximum = 729 (1 samples)
Network latency average = 13.7655 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Flit latency average = 11.0758 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Fragmentation average = 0 (1 samples)
	minimum = 0 (1 samples)
	maximum = 0 (1 samples)
Injected packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Accepted packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Injected flit rate average = 0.0892902 (1 samples)
	minimum = 0.0318131 (1 samples)
	maximum = 0.173936 (1 samples)
Accepted flit rate average = 0.0892902 (1 samples)
	minimum = 0.0392072 (1 samples)
	maximum = 0.129406 (1 samples)
Injected packet size average = 2.89437 (1 samples)
Accepted packet size average = 2.89437 (1 samples)
Hops average = 1 (1 samples)
----------------------------END-of-Interconnect-DETAILS-------------------------


gpgpu_simulation_time = 0 days, 0 hrs, 0 min, 50 sec (50 sec)
gpgpu_simulation_rate = 332640 (inst/sec)
gpgpu_simulation_rate = 965 (cycle/sec)
total time is 50082 ms


        *** GPGPU-Sim Simulator Version 3.2.2  [build 0] ***


GPGPU-Sim PTX: simulation mode 0 (can change with PTX_SIM_MODE_FUNC environment variable:
               1=functional simulation only, 0=detailed performance simulator)
GPGPU-Sim: Configuration options:

-network_mode                           1 # Interconnection network mode
-inter_config_file   config_fermi_islip.icnt # Interconnection network config file
-gpgpu_ptx_use_cuobjdump                    1 # Use cuobjdump to extract ptx and sass from binaries
-gpgpu_experimental_lib_support                    0 # Try to extract code from cuda libraries [Broken because of unknown cudaGetExportTable]
-gpgpu_ptx_convert_to_ptxplus                    0 # Convert SASS (native ISA) to ptxplus and run ptxplus
-gpgpu_ptx_force_max_capability                   20 # Force maximum compute capability
-gpgpu_ptx_inst_debug_to_file                    0 # Dump executed instructions' debug information to file
-gpgpu_ptx_inst_debug_file       inst_debug.txt # Executed instructions' debug output file
-gpgpu_ptx_inst_debug_thread_uid                    1 # Thread UID for executed instructions' debug output
-gpgpu_simd_model                       1 # 1 = post-dominator
-gpgpu_shader_core_pipeline              1536:32 # shader core pipeline config, i.e., {<nthread>:<warpsize>}
-gpgpu_tex_cache:l1  4:128:24,L:R:m:N:L,F:128:4,128:2 # per-shader L1 texture cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>:<rf>}
-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4 # per-shader L1 constant memory cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:il1     4:128:4,L:R:f:N:L,A:2:32,4 # shader L1 instruction cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:dl1     32:128:4,L:L:m:N:H,A:32:8,8 # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PrefL1                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PreShared                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gmem_skip_L1D                          0 # global memory access skip L1D cache (implements -Xptxas -dlcm=cg, default=no skip)
-gpgpu_perfect_mem                      0 # enable perfect memory mode (no cache miss)
-n_regfile_gating_group                    4 # group of lanes that should be read/written together)
-gpgpu_clock_gated_reg_file                    0 # enable clock gated reg file for power calculations
-gpgpu_clock_gated_lanes                    0 # enable clock gated lanes for power calculations
-gpgpu_shader_registers                32768 # Number of registers per shader core. Limits number of concurrent CTAs. (default 8192)
-gpgpu_shader_cta                       8 # Maximum number of concurrent CTAs in shader (default 8)
-gpgpu_num_cta_barriers                   16 # Maximum number of named barriers per CTA (default 16)
-gpgpu_n_clusters                      15 # number of processing clusters
-gpgpu_n_cores_per_cluster                    4 # number of simd cores per cluster
-gpgpu_n_cluster_ejection_buffer_size                    8 # number of packets in ejection buffer
-gpgpu_n_ldst_response_buffer_size                    2 # number of response packets in ld/st unit ejection buffer
-gpgpu_shmem_size                   16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size                   49152 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefL1                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefShared                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_num_banks                   32 # Number of banks in the shared memory in each shader core (default 16)
-gpgpu_shmem_limited_broadcast                    0 # Limit shared memory to do one broadcast per cycle (default on)
-gpgpu_shmem_warp_parts                    1 # Number of portions a warp is divided into for shared memory bank conflict check 
-gpgpu_warpdistro_shader                   -1 # Specify which shader core to collect the warp size distribution from
-gpgpu_warp_issue_shader                    0 # Specify which shader core to collect the warp issue distribution from
-gpgpu_local_mem_map                    1 # Mapping from local memory space address to simulated GPU physical address space (default = enabled)
-gpgpu_num_reg_banks                   16 # Number of register banks (default = 8)
-gpgpu_reg_bank_use_warp_id                    0 # Use warp ID in mapping registers to banks (default = off)
-gpgpu_operand_collector_num_units_sp                    6 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_sfu                    8 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_mem                    2 # number of collector units (default = 2)
-gpgpu_operand_collector_num_units_gen                    0 # number of collector units (default = 0)
-gpgpu_operand_collector_num_in_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_operand_collector_num_out_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_coalesce_arch                   13 # Coalescing arch (default = 13, anything else is off for now)
-gpgpu_num_sched_per_core                    2 # Number of warp schedulers per core
-gpgpu_max_insn_issue_per_warp                    1 # Max number of instructions that can be issued per warp in one cycle by scheduler
-gpgpu_simt_core_sim_order                    1 # Select the simulation order of cores in a cluster (0=Fix, 1=Round-Robin)
-gpgpu_pipeline_widths        2,1,1,2,1,1,2 # Pipeline widths ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
-gpgpu_num_sp_units                     2 # Number of SP units (default=1)
-gpgpu_num_sfu_units                    1 # Number of SF units (default=1)
-gpgpu_num_mem_units                    1 # Number if ldst units (default=1) WARNING: not hooked up to anything
-gpgpu_scheduler                      gto # Scheduler configuration: < lrr | gto | two_level_active > If two_level_active:<num_active_warps>:<inner_prioritization>:<outer_prioritization>For complete list of prioritization values see shader.h enum scheduler_prioritization_typeDefault: gto
-gpgpu_dram_scheduler                    1 # 0 = fifo, 1 = FR-FCFS (defaul)
-gpgpu_dram_partition_queues              8:8:8:8 # i2$:$2d:d2$:$2i
-l2_ideal                               0 # Use a ideal L2 cache that always hit
-gpgpu_cache:dl2     64:128:8,L:B:m:W:L,A:32:4,4:0,32 # unified banked L2 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>}
-gpgpu_cache:dl2_texture_only                    0 # L2 cache used for texture only
-gpgpu_n_mem                            6 # number of memory modules (e.g. memory controllers) in gpu
-gpgpu_n_sub_partition_per_mchannel                    2 # number of memory subpartition in each memory module
-gpgpu_n_mem_per_ctrlr                    2 # number of memory chips per memory controller
-gpgpu_memlatency_stat                   14 # track and display latency statistics 0x2 enables MC, 0x4 enables queue logs
-gpgpu_frfcfs_dram_sched_queue_size                   16 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_return_queue_size                  116 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_buswidth                    4 # default = 4 bytes (8 bytes per cycle at DDR)
-gpgpu_dram_burst_length                    8 # Burst length of each DRAM request (default = 4 data bus cycle)
-dram_data_command_freq_ratio                    4 # Frequency ratio between DRAM data bus and command bus (default = 2 times, i.e. DDR)
-gpgpu_dram_timing_opt nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2 # DRAM timing parameters = {nbk:tCCD:tRRD:tRCD:tRAS:tRP:tRC:CL:WL:tCDLR:tWR:nbkgrp:tCCDL:tRTPL}
-rop_latency                          120 # ROP queue latency (default 85)
-dram_latency                         100 # DRAM latency (default 30)
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS # mapping memory address to dram model {dramid@<start bit>;<memory address map>}
-gpgpu_mem_addr_test                    0 # run sweep test to check address mapping for aliased address
-gpgpu_mem_address_mask                    1 # 0 = old addressing mask, 1 = new addressing mask, 2 = new add. mask + flipped bank sel and chip sel bits
-gpuwattch_xml_file  gpuwattch_gtx480.xml # GPUWattch XML file
-power_simulation_enabled                    1 # Turn on power simulator (1=On, 0=Off)
-power_per_cycle_dump                    0 # Dump detailed power output each cycle
-power_trace_enabled                    0 # produce a file for the power trace (1=On, 0=Off)
-power_trace_zlevel                     6 # Compression level of the power trace output log (0=no comp, 9=highest)
-steady_power_levels_enabled                    0 # produce a file for the steady power levels (1=On, 0=Off)
-steady_state_definition                  8:4 # allowed deviation:number of samples
-gpgpu_max_cycle                        0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_insn                         0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_cta                          0 # terminates gpu simulation early (0 = no limit)
-gpgpu_runtime_stat                   500 # display runtime statistics such as dram utilization {<freq>:<flag>}
-liveness_message_freq                    1 # Minimum number of seconds between simulation liveness messages (0 = always print)
-gpgpu_flush_l1_cache                    0 # Flush L1 cache at the end of each kernel call
-gpgpu_flush_l2_cache                    0 # Flush L2 cache at the end of each kernel call
-gpgpu_deadlock_detect                    1 # Stop the simulation at deadlock (1=on (default), 0=off)
-gpgpu_ptx_instruction_classification                    0 # if enabled will classify ptx instruction types per kernel (Max 255 kernels now)
-gpgpu_ptx_sim_mode                     0 # Select between Performance (default) or Functional simulation (1)
-gpgpu_clock_domains 700.0:700.0:700.0:924.0 # Clock Domain Frequencies in MhZ {<Core Clock>:<ICNT Clock>:<L2 Clock>:<DRAM Clock>}
-gpgpu_max_concurrent_kernel                    8 # maximum kernels that can run concurrently on GPU
-gpgpu_cflog_interval                    0 # Interval between each snapshot in control flow logger
-visualizer_enabled                     0 # Turn on visualizer output (1=On, 0=Off)
-visualizer_outputfile                 NULL # Specifies the output log file for visualizer
-visualizer_zlevel                      6 # Compression level of the visualizer output log (0=no comp, 9=highest)
-trace_enabled                          0 # Turn on traces
-trace_components                    none # comma seperated list of traces to enable. Complete list found in trace_streams.tup. Default none
-trace_sampling_core                    0 # The core which is printed using CORE_DPRINTF. Default 0
-trace_sampling_memory_partition                   -1 # The memory partition which is printed using MEMPART_DPRINTF. Default -1 (i.e. all)
-enable_ptx_file_line_stats                    1 # Turn on PTX source line statistic profiling. (1 = On)
-ptx_line_stats_filename gpgpu_inst_stats.txt # Output file for PTX source line statistics.
-save_embedded_ptx                      0 # saves ptx files embedded in binary as <n>.ptx
-keep                                   0 # keep intermediate files created by GPGPU-Sim when interfacing with external programs
-gpgpu_ptx_save_converted_ptxplus                    0 # Saved converted ptxplus to a file
-ptx_opcode_latency_int         4,13,4,5,145 # Opcode latencies for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,19,25,145
-ptx_opcode_latency_fp          4,13,4,5,39 # Opcode latencies for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,30
-ptx_opcode_latency_dp         8,19,8,8,330 # Opcode latencies for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,335
-ptx_opcode_initiation_int            1,2,2,1,8 # Opcode initiation intervals for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,4,4,32
-ptx_opcode_initiation_fp            1,2,1,1,4 # Opcode initiation intervals for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,5
-ptx_opcode_initiation_dp         8,16,8,8,130 # Opcode initiation intervals for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,130
DRAM Timing Options:
nbk                                    16 # number of banks
CCD                                     2 # column to column delay
RRD                                     6 # minimal delay between activation of rows in different banks
RCD                                    12 # row to column delay
RAS                                    28 # time needed to activate row
RP                                     12 # time needed to precharge (deactivate) row
RC                                     40 # row cycle time
CDLR                                    5 # switching from write to read (changes tWTR)
WR                                     12 # last data-in to row precharge
CL                                     12 # CAS latency
WL                                      4 # Write latency
nbkgrp                                  4 # number of bank groups
CCDL                                    3 # column to column delay between accesses to different bank groups
RTPL                                    2 # read to precharge delay between accesses to different bank groups
Total number of memory sub partition = 12
addr_dec_mask[CHIP]  = 0000000000000000 	high:64 low:0
addr_dec_mask[BK]    = 000000000000e100 	high:16 low:8
addr_dec_mask[ROW]   = 000000000fff0000 	high:28 low:16
addr_dec_mask[COL]   = 0000000000001eff 	high:13 low:0
addr_dec_mask[BURST] = 000000000000003f 	high:6 low:0
sub_partition_id_mask = 0000000000000100
GPGPU-Sim uArch: clock freqs: 700000000.000000:700000000.000000:700000000.000000:924000000.000000
GPGPU-Sim uArch: clock periods: 0.00000000142857142857:0.00000000142857142857:0.00000000142857142857:0.00000000108225108225
*** Initializing Memory Statistics ***
GPGPU-Sim uArch: interconnect node map (shaderID+MemID to icntID)
GPGPU-Sim uArch: Memory nodes ID start from index: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
GPGPU-Sim uArch: interconnect node reverse map (icntID to shaderID+MemID)
GPGPU-Sim uArch: Memory nodes start from ID: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
83a4e518f69376f7e08643a3a9e17862  /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
GPGPU-Sim uArch: performance model initialization complete.
GPGPU-Sim PTX: __cudaRegisterFatBinary, fat_cubin_handle = 1, filename=mm.cu
self exe links to: /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
Running md5sum using "md5sum /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM "
Running cuobjdump using "$CUDA_INSTALL_PATH/bin/cuobjdump -ptx -elf -sass /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM > _cuobjdump_complete_output_lUYlYs"
Parsing file _cuobjdump_complete_output_lUYlYs
######### cuobjdump parser ########
## Adding new section ELF
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_1.ptx
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section ELF
Adding arch: sm_20
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_2.ptx
Adding arch: sm_20
Adding identifier: mm.cu
Done parsing!!!
GPGPU-Sim PTX: __cudaRegisterFunction _Z14matrix_mul_gpuPiS_S_i : hostFun 0x0x400ce0, fat_cubin_handle = 1
GPGPU-Sim PTX: instruction assembly for function '_Z14matrix_mul_gpuPiS_S_i'...   done.
GPGPU-Sim PTX: finding reconvergence points for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: reconvergence points for _Z14matrix_mul_gpuPiS_S_i...
GPGPU-Sim PTX:  1 (potential) branch divergence @  PC=0x048 (_1.ptx:71) @%p1 bra $Lt_0_2306;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX:  2 (potential) branch divergence @  PC=0x130 (_1.ptx:103) @%p2 bra $Lt_0_1794;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:  3 (potential) branch divergence @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX: ... end of reconvergence points for _Z14matrix_mul_gpuPiS_S_i
GPGPU-Sim PTX: ... done pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'.
GPGPU-Sim PTX: finished parsing EMBEDDED .ptx file _1.ptx
Adding _cuobjdump_2.ptx with cubin handle 1
GPGPU-Sim PTX: extracting embedded .ptx to temporary file "_ptx_kCQYv6"
Running: cat _ptx_kCQYv6 | sed 's/.version 1.5/.version 1.4/' | sed 's/, texmode_independent//' | sed 's/\(\.extern \.const\[1\] .b8 \w\+\)\[\]/\1\[1\]/' | sed 's/const\[.\]/const\[0\]/g' > _ptx2_DDmC3J
GPGPU-Sim PTX: generating ptxinfo using "$CUDA_INSTALL_PATH/bin/ptxas --gpu-name=sm_20 -v _ptx2_DDmC3J --output-file  /dev/null 2> _ptx_kCQYv6info"
GPGPU-Sim PTX: Kernel '_Z14matrix_mul_gpuPiS_S_i' : regs=14, lmem=0, smem=0, cmem=60
GPGPU-Sim PTX: removing ptxinfo using "rm -f _ptx_kCQYv6 _ptx2_DDmC3J _ptx_kCQYv6info"
GPGPU-Sim PTX: loading globals with explicit initializers... 
GPGPU-Sim PTX: finished loading globals (0 bytes total).
GPGPU-Sim PTX: loading constants with explicit initializers...  done.
Block(10,10)   Grid(15,8).

GPGPU-Sim PTX: cudaLaunch for 0x0x400ce0 (mode=performance simulation) on stream 0
GPGPU-Sim PTX: pushing kernel '_Z14matrix_mul_gpuPiS_S_i' to stream 0, gridDim= (15,8,1) blockDim = (10,10,1) 
kernel '_Z14matrix_mul_gpuPiS_S_i' transfer to GPU hardware scheduler
GPGPU-Sim uArch: Shader 4 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: CTA/core = 8, limited by: cta_limit
GPGPU-Sim uArch: core:  4, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 8 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  8, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 12 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 12, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 16 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 16, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 20 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 20, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 24 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 24, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 28 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 28, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 32 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 32, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 36 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 36, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 40 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 40, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 44 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 44, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 48 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 48, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 52 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 52, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 56 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 56, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 0 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  0, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 5 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  5, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 9 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  9, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 13 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 13, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 17 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 17, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 21 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 21, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 25 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 25, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 29 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 29, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 33 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 33, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 37 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 37, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 41 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 41, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 45 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 45, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 49 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 49, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 53 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 53, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 57 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 57, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 1 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  1, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 6 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  6, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 10 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 10, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 14 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 14, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 18 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 18, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 22 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 22, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 26 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 26, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 30 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 30, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 34 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 34, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 38 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 38, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 42 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 42, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 46 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 46, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 50 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 50, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 54 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 54, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 58 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 58, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 2 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  2, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 7 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  7, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 11 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 11, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 15 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 15, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 19 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 19, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 23 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 23, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 27 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 27, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 31 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 31, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 35 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 35, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 39 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 39, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 43 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 43, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 47 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 47, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 51 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 51, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 55 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 55, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 59 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 59, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 3 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  3, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: core:  4, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  8, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 12, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 16, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 20, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 24, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 28, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 32, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 36, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 40, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 44, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 48, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 52, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 56, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  0, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  5, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  9, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 13, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 17, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 21, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 25, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 29, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 33, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 37, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 41, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 45, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 49, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 53, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 57, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  1, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  6, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 10, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 14, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 18, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 22, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 26, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 30, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 34, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 38, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 42, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 46, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 50, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 54, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 58, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  2, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  7, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 11, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 15, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 19, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 23, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 27, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 31, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 35, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 39, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 43, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 47, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 51, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 55, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 59, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core:  3, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: cycles simulated: 500  inst.: 49456 (ipc=98.9) sim_rate=49456 (inst/sec) elapsed = 0:0:00:01 / Mon Jun 14 15:48:05 2021
GPGPU-Sim PTX: 100000 instructions simulated : ctaid=(1,0,0) tid=(1,5,0)
GPGPU-Sim PTX: 200000 instructions simulated : ctaid=(0,4,0) tid=(1,1,0)
GPGPU-Sim PTX: 300000 instructions simulated : ctaid=(8,2,0) tid=(5,3,0)
GPGPU-Sim uArch: cycles simulated: 1500  inst.: 305100 (ipc=203.4) sim_rate=152550 (inst/sec) elapsed = 0:0:00:02 / Mon Jun 14 15:48:06 2021
GPGPU-Sim PTX: 400000 instructions simulated : ctaid=(2,2,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 2500  inst.: 473544 (ipc=189.4) sim_rate=157848 (inst/sec) elapsed = 0:0:00:03 / Mon Jun 14 15:48:07 2021
GPGPU-Sim PTX: 500000 instructions simulated : ctaid=(12,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 600000 instructions simulated : ctaid=(3,6,0) tid=(7,4,0)
GPGPU-Sim PTX: 700000 instructions simulated : ctaid=(8,1,0) tid=(3,8,0)
GPGPU-Sim PTX: 800000 instructions simulated : ctaid=(3,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 900000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 3500  inst.: 874228 (ipc=249.8) sim_rate=218557 (inst/sec) elapsed = 0:0:00:04 / Mon Jun 14 15:48:08 2021
GPGPU-Sim PTX: 1000000 instructions simulated : ctaid=(8,6,0) tid=(7,8,0)
GPGPU-Sim PTX: 1100000 instructions simulated : ctaid=(1,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 1200000 instructions simulated : ctaid=(5,0,0) tid=(9,1,0)
GPGPU-Sim PTX: 1300000 instructions simulated : ctaid=(11,0,0) tid=(9,9,0)
GPGPU-Sim uArch: cycles simulated: 4500  inst.: 1279928 (ipc=284.4) sim_rate=255985 (inst/sec) elapsed = 0:0:00:05 / Mon Jun 14 15:48:09 2021
GPGPU-Sim PTX: 1400000 instructions simulated : ctaid=(9,0,0) tid=(3,4,0)
GPGPU-Sim PTX: 1500000 instructions simulated : ctaid=(10,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 1600000 instructions simulated : ctaid=(4,5,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 5500  inst.: 1658768 (ipc=301.6) sim_rate=276461 (inst/sec) elapsed = 0:0:00:06 / Mon Jun 14 15:48:10 2021
GPGPU-Sim PTX: 1700000 instructions simulated : ctaid=(10,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 1800000 instructions simulated : ctaid=(0,3,0) tid=(5,7,0)
GPGPU-Sim PTX: 1900000 instructions simulated : ctaid=(1,1,0) tid=(7,6,0)
GPGPU-Sim PTX: 2000000 instructions simulated : ctaid=(14,4,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 6500  inst.: 2010508 (ipc=309.3) sim_rate=287215 (inst/sec) elapsed = 0:0:00:07 / Mon Jun 14 15:48:11 2021
GPGPU-Sim PTX: 2100000 instructions simulated : ctaid=(2,5,0) tid=(1,7,0)
GPGPU-Sim PTX: 2200000 instructions simulated : ctaid=(5,2,0) tid=(9,7,0)
GPGPU-Sim PTX: 2300000 instructions simulated : ctaid=(1,2,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 7500  inst.: 2365768 (ipc=315.4) sim_rate=295721 (inst/sec) elapsed = 0:0:00:08 / Mon Jun 14 15:48:12 2021
GPGPU-Sim PTX: 2400000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 2500000 instructions simulated : ctaid=(14,2,0) tid=(3,4,0)
GPGPU-Sim PTX: 2600000 instructions simulated : ctaid=(11,4,0) tid=(5,3,0)
GPGPU-Sim PTX: 2700000 instructions simulated : ctaid=(0,1,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 8500  inst.: 2717496 (ipc=319.7) sim_rate=301944 (inst/sec) elapsed = 0:0:00:09 / Mon Jun 14 15:48:13 2021
GPGPU-Sim PTX: 2800000 instructions simulated : ctaid=(1,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 2900000 instructions simulated : ctaid=(6,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 3000000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 9500  inst.: 3069396 (ipc=323.1) sim_rate=306939 (inst/sec) elapsed = 0:0:00:10 / Mon Jun 14 15:48:14 2021
GPGPU-Sim PTX: 3100000 instructions simulated : ctaid=(9,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 3200000 instructions simulated : ctaid=(5,1,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 10000  inst.: 3257748 (ipc=325.8) sim_rate=296158 (inst/sec) elapsed = 0:0:00:11 / Mon Jun 14 15:48:15 2021
GPGPU-Sim PTX: 3300000 instructions simulated : ctaid=(12,6,0) tid=(9,5,0)
GPGPU-Sim PTX: 3400000 instructions simulated : ctaid=(0,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 3500000 instructions simulated : ctaid=(14,7,0) tid=(3,8,0)
GPGPU-Sim PTX: 3600000 instructions simulated : ctaid=(8,0,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 11000  inst.: 3600988 (ipc=327.4) sim_rate=300082 (inst/sec) elapsed = 0:0:00:12 / Mon Jun 14 15:48:16 2021
GPGPU-Sim PTX: 3700000 instructions simulated : ctaid=(12,3,0) tid=(5,3,0)
GPGPU-Sim PTX: 3800000 instructions simulated : ctaid=(13,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 3900000 instructions simulated : ctaid=(4,7,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 12000  inst.: 3965588 (ipc=330.5) sim_rate=305045 (inst/sec) elapsed = 0:0:00:13 / Mon Jun 14 15:48:17 2021
GPGPU-Sim PTX: 4000000 instructions simulated : ctaid=(4,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 4100000 instructions simulated : ctaid=(5,6,0) tid=(7,2,0)
GPGPU-Sim PTX: 4200000 instructions simulated : ctaid=(3,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 4300000 instructions simulated : ctaid=(5,2,0) tid=(5,7,0)
GPGPU-Sim uArch: cycles simulated: 13000  inst.: 4315724 (ipc=332.0) sim_rate=308266 (inst/sec) elapsed = 0:0:00:14 / Mon Jun 14 15:48:18 2021
GPGPU-Sim PTX: 4400000 instructions simulated : ctaid=(1,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 4500000 instructions simulated : ctaid=(4,3,0) tid=(7,6,0)
GPGPU-Sim PTX: 4600000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 14000  inst.: 4667380 (ipc=333.4) sim_rate=311158 (inst/sec) elapsed = 0:0:00:15 / Mon Jun 14 15:48:19 2021
GPGPU-Sim PTX: 4700000 instructions simulated : ctaid=(12,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 4800000 instructions simulated : ctaid=(12,7,0) tid=(7,6,0)
GPGPU-Sim PTX: 4900000 instructions simulated : ctaid=(14,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 5000000 instructions simulated : ctaid=(14,2,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 15000  inst.: 5017508 (ipc=334.5) sim_rate=313594 (inst/sec) elapsed = 0:0:00:16 / Mon Jun 14 15:48:20 2021
GPGPU-Sim PTX: 5100000 instructions simulated : ctaid=(7,3,0) tid=(3,8,0)
GPGPU-Sim PTX: 5200000 instructions simulated : ctaid=(14,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 5300000 instructions simulated : ctaid=(9,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 5400000 instructions simulated : ctaid=(3,5,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 16000  inst.: 5386128 (ipc=336.6) sim_rate=316831 (inst/sec) elapsed = 0:0:00:17 / Mon Jun 14 15:48:21 2021
GPGPU-Sim PTX: 5500000 instructions simulated : ctaid=(4,3,0) tid=(5,3,0)
GPGPU-Sim PTX: 5600000 instructions simulated : ctaid=(5,1,0) tid=(9,9,0)
GPGPU-Sim PTX: 5700000 instructions simulated : ctaid=(8,0,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 17000  inst.: 5743808 (ipc=337.9) sim_rate=319100 (inst/sec) elapsed = 0:0:00:18 / Mon Jun 14 15:48:22 2021
GPGPU-Sim PTX: 5800000 instructions simulated : ctaid=(3,1,0) tid=(1,3,0)
GPGPU-Sim PTX: 5900000 instructions simulated : ctaid=(0,7,0) tid=(9,1,0)
GPGPU-Sim PTX: 6000000 instructions simulated : ctaid=(3,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 6100000 instructions simulated : ctaid=(2,3,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 18000  inst.: 6097268 (ipc=338.7) sim_rate=320908 (inst/sec) elapsed = 0:0:00:19 / Mon Jun 14 15:48:23 2021
GPGPU-Sim PTX: 6200000 instructions simulated : ctaid=(2,0,0) tid=(9,3,0)
GPGPU-Sim PTX: 6300000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 6400000 instructions simulated : ctaid=(10,0,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 19000  inst.: 6468436 (ipc=340.4) sim_rate=323421 (inst/sec) elapsed = 0:0:00:20 / Mon Jun 14 15:48:24 2021
GPGPU-Sim PTX: 6500000 instructions simulated : ctaid=(13,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 6600000 instructions simulated : ctaid=(8,0,0) tid=(7,8,0)
GPGPU-Sim PTX: 6700000 instructions simulated : ctaid=(8,6,0) tid=(5,1,0)
GPGPU-Sim PTX: 6800000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 20000  inst.: 6825636 (ipc=341.3) sim_rate=325030 (inst/sec) elapsed = 0:0:00:21 / Mon Jun 14 15:48:25 2021
GPGPU-Sim PTX: 6900000 instructions simulated : ctaid=(5,6,0) tid=(9,9,0)
GPGPU-Sim PTX: 7000000 instructions simulated : ctaid=(4,5,0) tid=(5,9,0)
GPGPU-Sim uArch: cycles simulated: 20500  inst.: 7001524 (ipc=341.5) sim_rate=318251 (inst/sec) elapsed = 0:0:00:22 / Mon Jun 14 15:48:26 2021
GPGPU-Sim PTX: 7100000 instructions simulated : ctaid=(11,0,0) tid=(3,2,0)
GPGPU-Sim PTX: 7200000 instructions simulated : ctaid=(12,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 7300000 instructions simulated : ctaid=(12,0,0) tid=(5,1,0)
GPGPU-Sim uArch: cycles simulated: 21500  inst.: 7362756 (ipc=342.5) sim_rate=320119 (inst/sec) elapsed = 0:0:00:23 / Mon Jun 14 15:48:27 2021
GPGPU-Sim PTX: 7400000 instructions simulated : ctaid=(1,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 7500000 instructions simulated : ctaid=(11,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 7600000 instructions simulated : ctaid=(4,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 7700000 instructions simulated : ctaid=(11,7,0) tid=(5,9,0)
GPGPU-Sim uArch: cycles simulated: 22500  inst.: 7729204 (ipc=343.5) sim_rate=322050 (inst/sec) elapsed = 0:0:00:24 / Mon Jun 14 15:48:28 2021
GPGPU-Sim PTX: 7800000 instructions simulated : ctaid=(11,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 7900000 instructions simulated : ctaid=(7,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 8000000 instructions simulated : ctaid=(8,7,0) tid=(1,5,0)
GPGPU-Sim PTX: 8100000 instructions simulated : ctaid=(4,0,0) tid=(3,6,0)
GPGPU-Sim uArch: cycles simulated: 23500  inst.: 8076140 (ipc=343.7) sim_rate=323045 (inst/sec) elapsed = 0:0:00:25 / Mon Jun 14 15:48:29 2021
GPGPU-Sim PTX: 8200000 instructions simulated : ctaid=(6,3,0) tid=(7,0,0)
GPGPU-Sim PTX: 8300000 instructions simulated : ctaid=(12,7,0) tid=(3,6,0)
GPGPU-Sim PTX: 8400000 instructions simulated : ctaid=(5,5,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 24500  inst.: 8433512 (ipc=344.2) sim_rate=324365 (inst/sec) elapsed = 0:0:00:26 / Mon Jun 14 15:48:30 2021
GPGPU-Sim PTX: 8500000 instructions simulated : ctaid=(1,2,0) tid=(1,3,0)
GPGPU-Sim PTX: 8600000 instructions simulated : ctaid=(13,7,0) tid=(3,8,0)
GPGPU-Sim PTX: 8700000 instructions simulated : ctaid=(3,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 8800000 instructions simulated : ctaid=(11,4,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 25500  inst.: 8795428 (ipc=344.9) sim_rate=325756 (inst/sec) elapsed = 0:0:00:27 / Mon Jun 14 15:48:31 2021
GPGPU-Sim PTX: 8900000 instructions simulated : ctaid=(10,7,0) tid=(1,5,0)
GPGPU-Sim PTX: 9000000 instructions simulated : ctaid=(9,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 9100000 instructions simulated : ctaid=(10,1,0) tid=(9,3,0)
GPGPU-Sim uArch: cycles simulated: 26500  inst.: 9158844 (ipc=345.6) sim_rate=327101 (inst/sec) elapsed = 0:0:00:28 / Mon Jun 14 15:48:32 2021
GPGPU-Sim PTX: 9200000 instructions simulated : ctaid=(4,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 9300000 instructions simulated : ctaid=(8,4,0) tid=(1,9,0)
GPGPU-Sim PTX: 9400000 instructions simulated : ctaid=(6,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 9500000 instructions simulated : ctaid=(8,1,0) tid=(5,7,0)
GPGPU-Sim uArch: cycles simulated: 27500  inst.: 9507160 (ipc=345.7) sim_rate=327833 (inst/sec) elapsed = 0:0:00:29 / Mon Jun 14 15:48:33 2021
GPGPU-Sim PTX: 9600000 instructions simulated : ctaid=(5,1,0) tid=(9,7,0)
GPGPU-Sim PTX: 9700000 instructions simulated : ctaid=(7,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 9800000 instructions simulated : ctaid=(1,2,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 28500  inst.: 9864520 (ipc=346.1) sim_rate=328817 (inst/sec) elapsed = 0:0:00:30 / Mon Jun 14 15:48:34 2021
GPGPU-Sim PTX: 9900000 instructions simulated : ctaid=(1,7,0) tid=(7,8,0)
GPGPU-Sim PTX: 10000000 instructions simulated : ctaid=(1,2,0) tid=(3,0,0)
GPGPU-Sim PTX: 10100000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 10200000 instructions simulated : ctaid=(3,6,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 29500  inst.: 10222816 (ipc=346.5) sim_rate=329768 (inst/sec) elapsed = 0:0:00:31 / Mon Jun 14 15:48:35 2021
GPGPU-Sim PTX: 10300000 instructions simulated : ctaid=(2,1,0) tid=(3,4,0)
GPGPU-Sim PTX: 10400000 instructions simulated : ctaid=(11,2,0) tid=(1,5,0)
GPGPU-Sim PTX: 10500000 instructions simulated : ctaid=(6,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 30500  inst.: 10571584 (ipc=346.6) sim_rate=330362 (inst/sec) elapsed = 0:0:00:32 / Mon Jun 14 15:48:36 2021
GPGPU-Sim PTX: 10600000 instructions simulated : ctaid=(12,4,0) tid=(3,0,0)
GPGPU-Sim PTX: 10700000 instructions simulated : ctaid=(0,4,0) tid=(9,7,0)
GPGPU-Sim PTX: 10800000 instructions simulated : ctaid=(14,4,0) tid=(7,6,0)
GPGPU-Sim PTX: 10900000 instructions simulated : ctaid=(7,6,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 31500  inst.: 10941032 (ipc=347.3) sim_rate=331546 (inst/sec) elapsed = 0:0:00:33 / Mon Jun 14 15:48:37 2021
GPGPU-Sim PTX: 11000000 instructions simulated : ctaid=(6,3,0) tid=(5,5,0)
GPGPU-Sim PTX: 11100000 instructions simulated : ctaid=(13,2,0) tid=(7,4,0)
GPGPU-Sim PTX: 11200000 instructions simulated : ctaid=(6,4,0) tid=(7,0,0)
GPGPU-Sim PTX: 11300000 instructions simulated : ctaid=(3,6,0) tid=(9,9,0)
GPGPU-Sim uArch: cycles simulated: 32500  inst.: 11300316 (ipc=347.7) sim_rate=332362 (inst/sec) elapsed = 0:0:00:34 / Mon Jun 14 15:48:38 2021
GPGPU-Sim PTX: 11400000 instructions simulated : ctaid=(13,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 11500000 instructions simulated : ctaid=(4,5,0) tid=(5,5,0)
GPGPU-Sim PTX: 11600000 instructions simulated : ctaid=(12,2,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 33500  inst.: 11664956 (ipc=348.2) sim_rate=333284 (inst/sec) elapsed = 0:0:00:35 / Mon Jun 14 15:48:39 2021
GPGPU-Sim PTX: 11700000 instructions simulated : ctaid=(12,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 11800000 instructions simulated : ctaid=(7,1,0) tid=(7,0,0)
GPGPU-Sim PTX: 11900000 instructions simulated : ctaid=(2,2,0) tid=(5,5,0)
GPGPU-Sim PTX: 12000000 instructions simulated : ctaid=(8,6,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 34500  inst.: 12026356 (ipc=348.6) sim_rate=334065 (inst/sec) elapsed = 0:0:00:36 / Mon Jun 14 15:48:40 2021
GPGPU-Sim PTX: 12100000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 12200000 instructions simulated : ctaid=(11,4,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 35000  inst.: 12216676 (ipc=349.0) sim_rate=330180 (inst/sec) elapsed = 0:0:00:37 / Mon Jun 14 15:48:41 2021
GPGPU-Sim PTX: 12300000 instructions simulated : ctaid=(2,5,0) tid=(1,5,0)
GPGPU-Sim PTX: 12400000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 12500000 instructions simulated : ctaid=(2,5,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 36000  inst.: 12558896 (ipc=348.9) sim_rate=330497 (inst/sec) elapsed = 0:0:00:38 / Mon Jun 14 15:48:42 2021
GPGPU-Sim PTX: 12600000 instructions simulated : ctaid=(0,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 12700000 instructions simulated : ctaid=(6,3,0) tid=(7,2,0)
GPGPU-Sim PTX: 12800000 instructions simulated : ctaid=(5,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 12900000 instructions simulated : ctaid=(12,3,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 37000  inst.: 12910304 (ipc=348.9) sim_rate=331033 (inst/sec) elapsed = 0:0:00:39 / Mon Jun 14 15:48:43 2021
GPGPU-Sim PTX: 13000000 instructions simulated : ctaid=(14,2,0) tid=(1,5,0)
GPGPU-Sim PTX: 13100000 instructions simulated : ctaid=(11,5,0) tid=(1,3,0)
GPGPU-Sim PTX: 13200000 instructions simulated : ctaid=(8,4,0) tid=(3,8,0)
GPGPU-Sim uArch: cycles simulated: 38000  inst.: 13259572 (ipc=348.9) sim_rate=331489 (inst/sec) elapsed = 0:0:00:40 / Mon Jun 14 15:48:44 2021
GPGPU-Sim PTX: 13300000 instructions simulated : ctaid=(1,6,0) tid=(7,0,0)
GPGPU-Sim PTX: 13400000 instructions simulated : ctaid=(1,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 13500000 instructions simulated : ctaid=(3,0,0) tid=(1,3,0)
GPGPU-Sim PTX: 13600000 instructions simulated : ctaid=(7,5,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 39000  inst.: 13629308 (ipc=349.5) sim_rate=332422 (inst/sec) elapsed = 0:0:00:41 / Mon Jun 14 15:48:45 2021
GPGPU-Sim PTX: 13700000 instructions simulated : ctaid=(3,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 13800000 instructions simulated : ctaid=(2,1,0) tid=(1,3,0)
GPGPU-Sim PTX: 13900000 instructions simulated : ctaid=(1,2,0) tid=(3,6,0)
GPGPU-Sim PTX: 14000000 instructions simulated : ctaid=(1,4,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 40000  inst.: 13984824 (ipc=349.6) sim_rate=332972 (inst/sec) elapsed = 0:0:00:42 / Mon Jun 14 15:48:46 2021
GPGPU-Sim PTX: 14100000 instructions simulated : ctaid=(0,2,0) tid=(3,0,0)
GPGPU-Sim PTX: 14200000 instructions simulated : ctaid=(9,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 14300000 instructions simulated : ctaid=(4,6,0) tid=(3,8,0)
GPGPU-Sim uArch: cycles simulated: 41000  inst.: 14344484 (ipc=349.9) sim_rate=333592 (inst/sec) elapsed = 0:0:00:43 / Mon Jun 14 15:48:47 2021
GPGPU-Sim PTX: 14400000 instructions simulated : ctaid=(4,5,0) tid=(9,7,0)
GPGPU-Sim PTX: 14500000 instructions simulated : ctaid=(8,6,0) tid=(7,6,0)
GPGPU-Sim PTX: 14600000 instructions simulated : ctaid=(4,3,0) tid=(3,2,0)
GPGPU-Sim PTX: 14700000 instructions simulated : ctaid=(10,5,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 42000  inst.: 14687016 (ipc=349.7) sim_rate=333795 (inst/sec) elapsed = 0:0:00:44 / Mon Jun 14 15:48:48 2021
GPGPU-Sim PTX: 14800000 instructions simulated : ctaid=(11,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 14900000 instructions simulated : ctaid=(7,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 15000000 instructions simulated : ctaid=(14,6,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 43000  inst.: 15056520 (ipc=350.2) sim_rate=334589 (inst/sec) elapsed = 0:0:00:45 / Mon Jun 14 15:48:49 2021
GPGPU-Sim PTX: 15100000 instructions simulated : ctaid=(9,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 15200000 instructions simulated : ctaid=(3,2,0) tid=(5,5,0)
GPGPU-Sim PTX: 15300000 instructions simulated : ctaid=(14,1,0) tid=(5,5,0)
GPGPU-Sim PTX: 15400000 instructions simulated : ctaid=(0,2,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 44000  inst.: 15410888 (ipc=350.2) sim_rate=335019 (inst/sec) elapsed = 0:0:00:46 / Mon Jun 14 15:48:50 2021
GPGPU-Sim PTX: 15500000 instructions simulated : ctaid=(0,0,0) tid=(7,8,0)
GPGPU-Sim PTX: 15600000 instructions simulated : ctaid=(1,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 15700000 instructions simulated : ctaid=(6,3,0) tid=(3,6,0)
GPGPU-Sim uArch: cycles simulated: 45000  inst.: 15772716 (ipc=350.5) sim_rate=335589 (inst/sec) elapsed = 0:0:00:47 / Mon Jun 14 15:48:51 2021
GPGPU-Sim PTX: 15800000 instructions simulated : ctaid=(4,7,0) tid=(3,4,0)
GPGPU-Sim PTX: 15900000 instructions simulated : ctaid=(0,2,0) tid=(9,7,0)
GPGPU-Sim PTX: 16000000 instructions simulated : ctaid=(9,5,0) tid=(9,1,0)
GPGPU-Sim PTX: 16100000 instructions simulated : ctaid=(11,5,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 46000  inst.: 16147168 (ipc=351.0) sim_rate=336399 (inst/sec) elapsed = 0:0:00:48 / Mon Jun 14 15:48:52 2021
GPGPU-Sim PTX: 16200000 instructions simulated : ctaid=(6,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 16300000 instructions simulated : ctaid=(4,4,0) tid=(1,1,0)
GPGPU-Sim PTX: 16400000 instructions simulated : ctaid=(5,1,0) tid=(3,2,0)
GPGPU-Sim PTX: 16500000 instructions simulated : ctaid=(4,6,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 47000  inst.: 16499776 (ipc=351.1) sim_rate=336730 (inst/sec) elapsed = 0:0:00:49 / Mon Jun 14 15:48:53 2021
GPGPU-Sim PTX: 16600000 instructions simulated : ctaid=(10,2,0) tid=(5,1,0)
GPGPU-Sim uArch: Shader 16 finished CTA #0 (47265,0), 1 CTAs running
GPGPU-Sim uArch: Shader 29 finished CTA #0 (47414,0), 1 CTAs running
GPGPU-Sim uArch: Shader 5 finished CTA #0 (47423,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #0 (47436,0), 1 CTAs running
GPGPU-Sim uArch: Shader 51 finished CTA #0 (47497,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #0 (47499,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #0 (47501,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #0 (47559,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #1 (47584,0), 0 CTAs running
GPGPU-Sim uArch: Shader 14 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 39 finished CTA #0 (47615,0), 1 CTAs running
GPGPU-Sim uArch: Shader 16 finished CTA #1 (47620,0), 0 CTAs running
GPGPU-Sim uArch: Shader 16 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #0 (47632,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #0 (47633,0), 1 CTAs running
GPGPU-Sim uArch: Shader 7 finished CTA #0 (47636,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #1 (47654,0), 0 CTAs running
GPGPU-Sim uArch: Shader 17 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 10 finished CTA #0 (47661,0), 1 CTAs running
GPGPU-Sim uArch: Shader 34 finished CTA #1 (47663,0), 1 CTAs running
GPGPU-Sim uArch: Shader 53 finished CTA #0 (47671,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #0 (47690,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #1 (47725,0), 0 CTAs running
GPGPU-Sim uArch: Shader 18 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 27 finished CTA #0 (47732,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #0 (47734,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #0 (47754,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #0 (47758,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #0 (47760,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #1 (47767,0), 0 CTAs running
GPGPU-Sim uArch: Shader 12 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 5 finished CTA #1 (47768,0), 0 CTAs running
GPGPU-Sim uArch: Shader 5 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 32 finished CTA #0 (47776,0), 1 CTAs running
GPGPU-Sim uArch: Shader 27 finished CTA #1 (47790,0), 0 CTAs running
GPGPU-Sim uArch: Shader 27 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #0 (47794,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #0 (47796,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #1 (47798,0), 0 CTAs running
GPGPU-Sim uArch: Shader 8 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #0 (47811,0), 1 CTAs running
GPGPU-Sim uArch: Shader 25 finished CTA #0 (47812,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #1 (47813,0), 0 CTAs running
GPGPU-Sim uArch: Shader 3 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 13 finished CTA #1 (47818,0), 1 CTAs running
GPGPU-Sim uArch: Shader 13 finished CTA #0 (47838,0), 0 CTAs running
GPGPU-Sim uArch: Shader 13 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 44 finished CTA #0 (47844,0), 1 CTAs running
GPGPU-Sim uArch: Shader 32 finished CTA #1 (47848,0), 0 CTAs running
GPGPU-Sim uArch: Shader 32 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 50 finished CTA #1 (47862,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #1 (47863,0), 0 CTAs running
GPGPU-Sim uArch: Shader 23 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 53 finished CTA #1 (47865,0), 0 CTAs running
GPGPU-Sim uArch: Shader 53 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 7 finished CTA #1 (47867,0), 0 CTAs running
GPGPU-Sim uArch: Shader 7 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #1 (47875,0), 1 CTAs running
GPGPU-Sim uArch: Shader 52 finished CTA #1 (47884,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 19 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 40 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 6 finished CTA #0 (47888,0), 1 CTAs running
GPGPU-Sim uArch: Shader 57 finished CTA #0 (47892,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #1 (47904,0), 0 CTAs running
GPGPU-Sim uArch: Shader 55 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 52 finished CTA #0 (47906,0), 0 CTAs running
GPGPU-Sim uArch: Shader 52 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #0 (47909,0), 0 CTAs running
GPGPU-Sim uArch: Shader 20 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 0 finished CTA #0 (47914,0), 1 CTAs running
GPGPU-Sim uArch: Shader 10 finished CTA #1 (47915,0), 0 CTAs running
GPGPU-Sim uArch: Shader 10 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 29 finished CTA #1 (47919,0), 0 CTAs running
GPGPU-Sim uArch: Shader 29 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 11 finished CTA #0 (47938,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #1 (47963,0), 0 CTAs running
GPGPU-Sim uArch: Shader 45 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #1 (47964,0), 1 CTAs running
GPGPU-Sim uArch: Shader 11 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 11 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 57 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 57 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #0 (47967,0), 1 CTAs running
GPGPU-Sim uArch: Shader 43 finished CTA #0 (47975,0), 1 CTAs running
GPGPU-Sim uArch: Shader 47 finished CTA #0 (47978,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #0 (47985,0), 1 CTAs running
GPGPU-Sim uArch: Shader 36 finished CTA #0 (47987,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #1 (47988,0), 0 CTAs running
GPGPU-Sim uArch: Shader 4 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #1 (47991,0), 0 CTAs running
GPGPU-Sim uArch: Shader 9 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #0 (47996,0), 1 CTAs running
GPGPU-Sim uArch: cycles simulated: 48000  inst.: 16631860 (ipc=346.5) sim_rate=332637 (inst/sec) elapsed = 0:0:00:50 / Mon Jun 14 15:48:54 2021
GPGPU-Sim uArch: Shader 21 finished CTA #1 (48000,0), 0 CTAs running
GPGPU-Sim uArch: Shader 21 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 51 finished CTA #1 (48001,0), 0 CTAs running
GPGPU-Sim uArch: Shader 51 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #0 (48008,0), 0 CTAs running
GPGPU-Sim uArch: Shader 15 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #0 (48010,0), 1 CTAs running
GPGPU-Sim uArch: Shader 30 finished CTA #0 (48018,0), 1 CTAs running
GPGPU-Sim uArch: Shader 0 finished CTA #1 (48019,0), 0 CTAs running
GPGPU-Sim uArch: Shader 0 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 54 finished CTA #0 (48026,0), 1 CTAs running
GPGPU-Sim uArch: Shader 6 finished CTA #1 (48028,0), 0 CTAs running
GPGPU-Sim uArch: Shader 6 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #0 (48031,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #0 (48042,0), 1 CTAs running
GPGPU-Sim uArch: Shader 54 finished CTA #1 (48056,0), 0 CTAs running
GPGPU-Sim uArch: Shader 54 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #1 (48059,0), 0 CTAs running
GPGPU-Sim uArch: Shader 24 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #1 (48061,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #1 (48063,0), 0 CTAs running
GPGPU-Sim uArch: Shader 48 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 30 finished CTA #1 (48070,0), 0 CTAs running
GPGPU-Sim uArch: Shader 30 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 25 finished CTA #1 (48079,0), 0 CTAs running
GPGPU-Sim uArch: Shader 25 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #1 (48081,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #1 (48083,0), 0 CTAs running
GPGPU-Sim uArch: Shader 59 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #0 (48084,0), 1 CTAs running
GPGPU-Sim uArch: Shader 49 finished CTA #1 (48086,0), 1 CTAs running
GPGPU-Sim uArch: Shader 39 finished CTA #1 (48096,0), 0 CTAs running
GPGPU-Sim uArch: Shader 39 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #0 (48098,0), 0 CTAs running
GPGPU-Sim uArch: Shader 2 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 49 finished CTA #0 (48102,0), 0 CTAs running
GPGPU-Sim uArch: Shader 49 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 28 finished CTA #0 (48104,0), 1 CTAs running
GPGPU-Sim uArch: Shader 28 finished CTA #1 (48107,0), 0 CTAs running
GPGPU-Sim uArch: Shader 28 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #0 (48111,0), 1 CTAs running
GPGPU-Sim uArch: Shader 50 finished CTA #0 (48111,0), 0 CTAs running
GPGPU-Sim uArch: Shader 50 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #1 (48114,0), 0 CTAs running
GPGPU-Sim uArch: Shader 22 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #1 (48132,0), 0 CTAs running
GPGPU-Sim uArch: Shader 58 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 47 finished CTA #1 (48143,0), 0 CTAs running
GPGPU-Sim uArch: Shader 47 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #0 (48153,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #1 (48153,0), 0 CTAs running
GPGPU-Sim uArch: Shader 35 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 36 finished CTA #1 (48154,0), 0 CTAs running
GPGPU-Sim uArch: Shader 36 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #1 (48160,0), 0 CTAs running
GPGPU-Sim uArch: Shader 31 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 26 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 33 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #0 (48165,0), 0 CTAs running
GPGPU-Sim uArch: Shader 38 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 34 finished CTA #0 (48177,0), 0 CTAs running
GPGPU-Sim uArch: Shader 34 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #1 (48188,0), 0 CTAs running
GPGPU-Sim uArch: Shader 56 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #1 (48189,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #0 (48192,0), 1 CTAs running
GPGPU-Sim uArch: Shader 37 finished CTA #1 (48202,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #1 (48212,0), 0 CTAs running
GPGPU-Sim uArch: Shader 1 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #0 (48216,0), 0 CTAs running
GPGPU-Sim uArch: Shader 41 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 37 finished CTA #0 (48218,0), 0 CTAs running
GPGPU-Sim uArch: Shader 37 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #1 (48221,0), 0 CTAs running
GPGPU-Sim uArch: Shader 46 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #0 (48226,0), 1 CTAs running
GPGPU-Sim uArch: Shader 44 finished CTA #1 (48233,0), 0 CTAs running
GPGPU-Sim uArch: Shader 44 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #1 (48248,0), 0 CTAs running
GPGPU-Sim uArch: Shader 42 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 43 finished CTA #1 (48281,0), 0 CTAs running
GPGPU-Sim uArch: Shader 43 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: GPU detected kernel '_Z14matrix_mul_gpuPiS_S_i' finished on shader 43.
kernel_name = _Z14matrix_mul_gpuPiS_S_i 
kernel_launch_uid = 1 
gpu_sim_cycle = 48282
gpu_sim_insn = 16632000
gpu_ipc =     344.4762
gpu_tot_sim_cycle = 48282
gpu_tot_sim_insn = 16632000
gpu_tot_ipc =     344.4762
gpu_tot_issued_cta = 0
gpu_stall_dramfull = 42547
gpu_stall_icnt2sh    = 68778
gpu_total_sim_rate=332640

========= Core cache stats =========
L1I_cache:
	L1I_total_cache_accesses = 371520
	L1I_total_cache_misses = 1920
	L1I_total_cache_miss_rate = 0.0052
	L1I_total_cache_pending_hits = 0
	L1I_total_cache_reservation_fails = 0
L1D_cache:
	L1D_cache_core[0]: Access = 21513, Miss = 1284, Miss_rate = 0.060, Pending_hits = 5147, Reservation_fails = 2614
	L1D_cache_core[1]: Access = 21489, Miss = 1278, Miss_rate = 0.059, Pending_hits = 5140, Reservation_fails = 1796
	L1D_cache_core[2]: Access = 21492, Miss = 1281, Miss_rate = 0.060, Pending_hits = 5151, Reservation_fails = 2084
	L1D_cache_core[3]: Access = 21457, Miss = 1268, Miss_rate = 0.059, Pending_hits = 5125, Reservation_fails = 1525
	L1D_cache_core[4]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5144, Reservation_fails = 928
	L1D_cache_core[5]: Access = 21481, Miss = 1272, Miss_rate = 0.059, Pending_hits = 5142, Reservation_fails = 2045
	L1D_cache_core[6]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5143, Reservation_fails = 2898
	L1D_cache_core[7]: Access = 21505, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5158, Reservation_fails = 3324
	L1D_cache_core[8]: Access = 21508, Miss = 1279, Miss_rate = 0.059, Pending_hits = 5149, Reservation_fails = 2750
	L1D_cache_core[9]: Access = 21505, Miss = 1287, Miss_rate = 0.060, Pending_hits = 5157, Reservation_fails = 3313
	L1D_cache_core[10]: Access = 21508, Miss = 1293, Miss_rate = 0.060, Pending_hits = 5164, Reservation_fails = 3276
	L1D_cache_core[11]: Access = 21505, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5148, Reservation_fails = 2931
	L1D_cache_core[12]: Access = 21508, Miss = 1292, Miss_rate = 0.060, Pending_hits = 5150, Reservation_fails = 2987
	L1D_cache_core[13]: Access = 21513, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5145, Reservation_fails = 2983
	L1D_cache_core[14]: Access = 21516, Miss = 1285, Miss_rate = 0.060, Pending_hits = 5136, Reservation_fails = 3154
	L1D_total_cache_accesses = 322468
	L1D_total_cache_misses = 19228
	L1D_total_cache_miss_rate = 0.0596
	L1D_total_cache_pending_hits = 77199
	L1D_total_cache_reservation_fails = 38608
	L1D_cache_data_port_util = 0.078
	L1D_cache_fill_port_util = 0.006
L1C_cache:
	L1C_total_cache_accesses = 1920
	L1C_total_cache_misses = 480
	L1C_total_cache_miss_rate = 0.2500
	L1C_total_cache_pending_hits = 0
	L1C_total_cache_reservation_fails = 0
L1T_cache:
	L1T_total_cache_accesses = 0
	L1T_total_cache_misses = 0
	L1T_total_cache_pending_hits = 0
	L1T_total_cache_reservation_fails = 0

Total_core_cache_stats:
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 225461
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 77199
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 17972
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 13408
	Total_core_cache_stats_breakdown[CONST_ACC_R][HIT] = 1440
	Total_core_cache_stats_breakdown[CONST_ACC_R][MISS] = 480
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 580
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 1256
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][RESERVATION_FAIL] = 25200
	Total_core_cache_stats_breakdown[INST_ACC_R][HIT] = 369600
	Total_core_cache_stats_breakdown[INST_ACC_R][MISS] = 1920
Shader 0 warp_id issue ditsribution:
warp_id:
0, 1, 2, 3, 4, 5, 6, 7, 
distro:
1388, 1388, 1388, 1388, 1388, 1388, 1388, 1388, 
gpgpu_n_tot_thrd_icount = 21319680
gpgpu_n_tot_w_icount = 666240
gpgpu_n_stall_shd_mem = 216596
gpgpu_n_mem_read_local = 0
gpgpu_n_mem_write_local = 0
gpgpu_n_mem_read_global = 17972
gpgpu_n_mem_write_global = 1836
gpgpu_n_mem_texture = 0
gpgpu_n_mem_const = 60
gpgpu_n_load_insn  = 3600000
gpgpu_n_store_insn = 12000
gpgpu_n_shmem_insn = 0
gpgpu_n_tex_insn = 0
gpgpu_n_const_mem_insn = 0
gpgpu_n_param_mem_insn = 48000
gpgpu_n_shmem_bkconflict = 0
gpgpu_n_cache_bkconflict = 0
gpgpu_n_intrawarp_mshr_merge = 0
gpgpu_n_cmem_portconflict = 0
gpgpu_stall_shd_mem[c_mem][bk_conf] = 0
gpgpu_stall_shd_mem[c_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[c_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[c_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[t_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[t_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[t_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[s_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][coal_stall] = 216596
gpgpu_stall_shd_mem[gl_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[g_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[g_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpu_reg_bank_conflict_stalls = 0
Warp Occupancy Distribution:
Stall:169500	W0_Idle:293067	W0_Scoreboard:4645929	W1:0	W2:0	W3:0	W4:166560	W5:0	W6:0	W7:0	W8:0	W9:0	W10:0	W11:0	W12:0	W13:0	W14:0	W15:0	W16:0	W17:0	W18:0	W19:0	W20:0	W21:0	W22:0	W23:0	W24:0	W25:0	W26:0	W27:0	W28:0	W29:0	W30:0	W31:0	W32:499680
traffic_breakdown_coretomem[CONST_ACC_R] = 480 {8:60,}
traffic_breakdown_coretomem[GLOBAL_ACC_R] = 143776 {8:17972,}
traffic_breakdown_coretomem[GLOBAL_ACC_W] = 117600 {40:1008,72:552,136:276,}
traffic_breakdown_coretomem[INST_ACC_R] = 1920 {8:240,}
traffic_breakdown_memtocore[CONST_ACC_R] = 4320 {72:60,}
traffic_breakdown_memtocore[GLOBAL_ACC_R] = 2444192 {136:17972,}
traffic_breakdown_memtocore[GLOBAL_ACC_W] = 14688 {8:1836,}
traffic_breakdown_memtocore[INST_ACC_R] = 32640 {136:240,}
maxmrqlatency = 12 
maxdqlatency = 0 
maxmflatency = 1356 
averagemflatency = 264 
max_icnt2mem_latency = 1205 
max_icnt2sh_latency = 48281 
mrq_lat_table:1080 	32 	4 	10 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
dq_lat_table:0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_table:0 	0 	0 	0 	0 	0 	0 	10720 	8178 	899 	71 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2mem_lat_table:0 	0 	0 	15440 	432 	764 	1403 	1061 	742 	253 	13 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2sh_lat_table:0 	0 	0 	3446 	13655 	892 	39 	0 	0 	0 	0 	0 	0 	0 	0 	1836 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_pw_table:0 	0 	0 	0 	0 	0 	0 	87 	6 	2 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
maximum concurrent accesses to same row:
dram[0]:         1         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
maximum service time to same row:
dram[0]:       502         0         0         0      1650      1707      1653      1501     12085     12831     36822     37578         0         0         0         0 
dram[1]:      1485         0         0         0      1763      1494      1765      1521     12204     13041     36903     37765         0         0         0         0 
dram[2]:         0         0         0         0      1503      1510      1526      1516     12325     13141     37102     37849         0         0         0         0 
dram[3]:         0         0         0         0      1499      1588      1525      1678     12515     13263     37144     38024         0         0         0         0 
dram[4]:         0         0         0         0      1616      1493      1484      1515     12585     13452     37341     38075         0         0         0         0 
dram[5]:         0         0         0         0      1497      1606      1519      1688     12779     13531     37456     38221         0         0         0         0 
average row accesses per activate:
dram[0]:  1.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 21.000000 19.000000      -nan      -nan      -nan      -nan 
dram[1]:  2.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 19.000000 16.000000      -nan      -nan      -nan      -nan 
dram[2]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 18.000000      -nan      -nan      -nan      -nan 
dram[3]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 17.000000      -nan      -nan      -nan      -nan 
dram[4]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 18.000000 21.000000      -nan      -nan      -nan      -nan 
dram[5]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 22.000000 16.000000      -nan      -nan      -nan      -nan 
average row locality = 1126/52 = 21.653847
number of total memory accesses made:
dram[0]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
total accesses: 0
min_bank_accesses = 0!
min_chip_accesses = 0!
number of total read accesses:
dram[0]:         3         0         0         0        10        10        32        32        32        32        16        16         0         0         0         0 
dram[1]:         2         0         0         0        10        10        32        32        32        32        16        15         0         0         0         0 
dram[2]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[3]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[4]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[5]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
total reads: 1084
min_bank_accesses = 0!
chip skew: 183/180 = 1.02
number of total write accesses:
dram[0]:         0         0         0         0         0         0         0         0         0         0         5         3         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         3         1         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         3         4         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         3         3         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         2         7         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         6         2         0         0         0         0 
total reads: 42
min_bank_accesses = 0!
chip skew: 9/4 = 2.25
average mf latency per bank:
dram[0]:       8505    none      none      none        6219      4879      4705      5583      4863      4800      4214      4411    none      none      none      none  
dram[1]:          0    none      none      none        4030      5132      5447      4339      4812      4857      4258      5293    none      none      none      none  
dram[2]:     none      none      none      none        4973      3789      4482      4576      4719      4883      4095      3891    none      none      none      none  
dram[3]:     none      none      none      none        3574      4782      4611      4852      4883      5030      4219      3979    none      none      none      none  
dram[4]:     none      none      none      none        6249      4014      5355      4358      4534      4853      4952      3543    none      none      none      none  
dram[5]:     none      none      none      none        4108      5204      4843      5156      4601      4623      3826      4504    none      none      none      none  
maximum mf latency per bank:
dram[0]:        486         0         0         0      1110      1069      1107      1325       592       781       876       721         0         0         0         0
dram[1]:          0         0         0         0      1097       809      1356       876       801       646       697       706         0         0         0         0
dram[2]:          0         0         0         0       882       679       907      1071       716       771       873       759         0         0         0         0
dram[3]:          0         0         0         0       377       983      1018      1068       742       781       690       817         0         0         0         0
dram[4]:          0         0         0         0      1081       527      1180       723       799       631       718       700         0         0         0         0
dram[5]:          0         0         0         0       543      1153      1006      1168       728       775       873       742         0         0         0         0

Number of Memory Banks Accessed per Memory Operation per Warp (from 0):
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
Average # of Memory Banks Accessed per Memory Operation per Warp=-nan

position of mrq chosen
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	

average position of mrq chosen = -nan
Memory Partition 0: 
Cache L2_bank_000:
MSHR contents

Cache L2_bank_001:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[0]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63344 n_act=11 n_pre=2 n_req=191 n_rd=366 n_write=8 bw_util=0.01174
n_activity=2903 dram_eff=0.2577
bk0: 6a 63659i bk1: 0a 63730i bk2: 0a 63734i bk3: 0a 63734i bk4: 20a 63671i bk5: 20a 63680i bk6: 64a 63581i bk7: 64a 63588i bk8: 64a 63586i bk9: 64a 63580i bk10: 32a 63625i bk11: 32a 63618i bk12: 0a 63727i bk13: 0a 63727i bk14: 0a 63730i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000784548
Memory Partition 1: 
Cache L2_bank_002:
MSHR contents

Cache L2_bank_003:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[1]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=9 n_pre=0 n_req=185 n_rd=362 n_write=5 bw_util=0.01152
n_activity=2703 dram_eff=0.2716
bk0: 4a 63711i bk1: 0a 63732i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 20a 63676i bk6: 64a 63580i bk7: 64a 63580i bk8: 64a 63587i bk9: 64a 63581i bk10: 32a 63624i bk11: 30a 63641i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63732i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000266746
Memory Partition 2: 
Cache L2_bank_004:
MSHR contents

Cache L2_bank_005:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[2]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=187 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2765 dram_eff=0.2662
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63679i bk5: 24a 63671i bk6: 64a 63581i bk7: 64a 63590i bk8: 64a 63588i bk9: 64a 63587i bk10: 32a 63623i bk11: 28a 63638i bk12: 0a 63728i bk13: 0a 63730i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000235364
Memory Partition 3: 
Cache L2_bank_006:
MSHR contents

Cache L2_bank_007:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[3]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63357 n_act=8 n_pre=0 n_req=186 n_rd=360 n_write=6 bw_util=0.01149
n_activity=2830 dram_eff=0.2587
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63680i bk5: 24a 63670i bk6: 64a 63580i bk7: 64a 63572i bk8: 64a 63586i bk9: 64a 63589i bk10: 32a 63621i bk11: 28a 63638i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000580565
Memory Partition 4: 
Cache L2_bank_008:
MSHR contents

Cache L2_bank_009:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[4]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63353 n_act=8 n_pre=0 n_req=189 n_rd=360 n_write=10 bw_util=0.01161
n_activity=2771 dram_eff=0.2671
bk0: 0a 63731i bk1: 0a 63733i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 24a 63670i bk6: 64a 63587i bk7: 64a 63587i bk8: 64a 63588i bk9: 64a 63581i bk10: 32a 63633i bk11: 28a 63627i bk12: 0a 63728i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000313819
Memory Partition 5: 
Cache L2_bank_010:
MSHR contents

Cache L2_bank_011:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[5]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=188 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2789 dram_eff=0.2639
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63733i bk4: 20a 63678i bk5: 24a 63668i bk6: 64a 63584i bk7: 64a 63582i bk8: 64a 63589i bk9: 64a 63588i bk10: 32a 63598i bk11: 28a 63623i bk12: 0a 63729i bk13: 0a 63730i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.00105129

========= L2 cache stats =========
L2_cache_bank[0]: Access = 1826, Miss = 93, Miss_rate = 0.051, Pending_hits = 248, Reservation_fails = 4441
L2_cache_bank[1]: Access = 1684, Miss = 90, Miss_rate = 0.053, Pending_hits = 231, Reservation_fails = 3387
L2_cache_bank[2]: Access = 1777, Miss = 92, Miss_rate = 0.052, Pending_hits = 239, Reservation_fails = 3478
L2_cache_bank[3]: Access = 1652, Miss = 89, Miss_rate = 0.054, Pending_hits = 227, Reservation_fails = 3558
L2_cache_bank[4]: Access = 1642, Miss = 90, Miss_rate = 0.055, Pending_hits = 236, Reservation_fails = 3430
L2_cache_bank[5]: Access = 1661, Miss = 90, Miss_rate = 0.054, Pending_hits = 232, Reservation_fails = 3472
L2_cache_bank[6]: Access = 1637, Miss = 90, Miss_rate = 0.055, Pending_hits = 237, Reservation_fails = 3884
L2_cache_bank[7]: Access = 1639, Miss = 90, Miss_rate = 0.055, Pending_hits = 250, Reservation_fails = 4069
L2_cache_bank[8]: Access = 1656, Miss = 90, Miss_rate = 0.054, Pending_hits = 250, Reservation_fails = 3783
L2_cache_bank[9]: Access = 1643, Miss = 90, Miss_rate = 0.055, Pending_hits = 241, Reservation_fails = 3742
L2_cache_bank[10]: Access = 1641, Miss = 90, Miss_rate = 0.055, Pending_hits = 239, Reservation_fails = 3801
L2_cache_bank[11]: Access = 1650, Miss = 90, Miss_rate = 0.055, Pending_hits = 243, Reservation_fails = 3849
L2_total_cache_accesses = 20108
L2_total_cache_misses = 1084
L2_total_cache_miss_rate = 0.0539
L2_total_cache_pending_hits = 2873
L2_total_cache_reservation_fails = 44894
L2_total_cache_breakdown:
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 14077
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 2839
	L2_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 1056
	L2_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 44379
	L2_cache_stats_breakdown[CONST_ACC_R][HIT] = 56
	L2_cache_stats_breakdown[CONST_ACC_R][HIT_RESERVED] = 3
	L2_cache_stats_breakdown[CONST_ACC_R][MISS] = 1
	L2_cache_stats_breakdown[CONST_ACC_R][RESERVATION_FAIL] = 129
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 1794
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT_RESERVED] = 19
	L2_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 23
	L2_cache_stats_breakdown[INST_ACC_R][HIT] = 224
	L2_cache_stats_breakdown[INST_ACC_R][HIT_RESERVED] = 12
	L2_cache_stats_breakdown[INST_ACC_R][MISS] = 4
	L2_cache_stats_breakdown[INST_ACC_R][RESERVATION_FAIL] = 386
L2_cache_data_port_util = 0.104
L2_cache_fill_port_util = 0.007

icnt_total_pkts_mem_to_simt=93076
icnt_total_pkts_simt_to_mem=23324
LD_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ST_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
----------------------------Interconnect-DETAILS--------------------------------
Class 0:
Packet latency average = 18.5053
	minimum = 6
	maximum = 729
Network latency average = 13.7655
	minimum = 6
	maximum = 426
Slowest packet = 623
Flit latency average = 11.0758
	minimum = 6
	maximum = 426
Slowest flit = 1683
Fragmentation average = 0
	minimum = 0
	maximum = 0
Injected packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Accepted packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Injected flit rate average = 0.0892902
	minimum = 0.0318131 (at node 3)
	maximum = 0.173936 (at node 15)
Accepted flit rate average= 0.0892902
	minimum = 0.0392072 (at node 24)
	maximum = 0.129406 (at node 9)
Injected packet length average = 2.89437
Accepted packet length average = 2.89437
Total in-flight flits = 0 (0 measured)
====== Overall Traffic Statistics ======
====== Traffic class 0 ======
Packet latency average = 18.5053 (1 samples)
	minimum = 6 (1 samples)
	maximum = 729 (1 samples)
Network latency average = 13.7655 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Flit latency average = 11.0758 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Fragmentation average = 0 (1 samples)
	minimum = 0 (1 samples)
	maximum = 0 (1 samples)
Injected packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Accepted packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Injected flit rate average = 0.0892902 (1 samples)
	minimum = 0.0318131 (1 samples)
	maximum = 0.173936 (1 samples)
Accepted flit rate average = 0.0892902 (1 samples)
	minimum = 0.0392072 (1 samples)
	maximum = 0.129406 (1 samples)
Injected packet size average = 2.89437 (1 samples)
Accepted packet size average = 2.89437 (1 samples)
Hops average = 1 (1 samples)
----------------------------END-of-Interconnect-DETAILS-------------------------


gpgpu_simulation_time = 0 days, 0 hrs, 0 min, 50 sec (50 sec)
gpgpu_simulation_rate = 332640 (inst/sec)
gpgpu_simulation_rate = 965 (cycle/sec)
total time is 49782 ms


        *** GPGPU-Sim Simulator Version 3.2.2  [build 0] ***


GPGPU-Sim PTX: simulation mode 0 (can change with PTX_SIM_MODE_FUNC environment variable:
               1=functional simulation only, 0=detailed performance simulator)
GPGPU-Sim: Configuration options:

-network_mode                           1 # Interconnection network mode
-inter_config_file   config_fermi_islip.icnt # Interconnection network config file
-gpgpu_ptx_use_cuobjdump                    1 # Use cuobjdump to extract ptx and sass from binaries
-gpgpu_experimental_lib_support                    0 # Try to extract code from cuda libraries [Broken because of unknown cudaGetExportTable]
-gpgpu_ptx_convert_to_ptxplus                    0 # Convert SASS (native ISA) to ptxplus and run ptxplus
-gpgpu_ptx_force_max_capability                   20 # Force maximum compute capability
-gpgpu_ptx_inst_debug_to_file                    0 # Dump executed instructions' debug information to file
-gpgpu_ptx_inst_debug_file       inst_debug.txt # Executed instructions' debug output file
-gpgpu_ptx_inst_debug_thread_uid                    1 # Thread UID for executed instructions' debug output
-gpgpu_simd_model                       1 # 1 = post-dominator
-gpgpu_shader_core_pipeline              1536:32 # shader core pipeline config, i.e., {<nthread>:<warpsize>}
-gpgpu_tex_cache:l1  4:128:24,L:R:m:N:L,F:128:4,128:2 # per-shader L1 texture cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>:<rf>}
-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4 # per-shader L1 constant memory cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:il1     4:128:4,L:R:f:N:L,A:2:32,4 # shader L1 instruction cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:dl1     32:128:4,L:L:m:N:H,A:32:8,8 # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PrefL1                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PreShared                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gmem_skip_L1D                          0 # global memory access skip L1D cache (implements -Xptxas -dlcm=cg, default=no skip)
-gpgpu_perfect_mem                      0 # enable perfect memory mode (no cache miss)
-n_regfile_gating_group                    4 # group of lanes that should be read/written together)
-gpgpu_clock_gated_reg_file                    0 # enable clock gated reg file for power calculations
-gpgpu_clock_gated_lanes                    0 # enable clock gated lanes for power calculations
-gpgpu_shader_registers                32768 # Number of registers per shader core. Limits number of concurrent CTAs. (default 8192)
-gpgpu_shader_cta                       8 # Maximum number of concurrent CTAs in shader (default 8)
-gpgpu_num_cta_barriers                   16 # Maximum number of named barriers per CTA (default 16)
-gpgpu_n_clusters                      15 # number of processing clusters
-gpgpu_n_cores_per_cluster                    4 # number of simd cores per cluster
-gpgpu_n_cluster_ejection_buffer_size                    8 # number of packets in ejection buffer
-gpgpu_n_ldst_response_buffer_size                    2 # number of response packets in ld/st unit ejection buffer
-gpgpu_shmem_size                   16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size                   49152 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefL1                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefShared                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_num_banks                   32 # Number of banks in the shared memory in each shader core (default 16)
-gpgpu_shmem_limited_broadcast                    0 # Limit shared memory to do one broadcast per cycle (default on)
-gpgpu_shmem_warp_parts                    1 # Number of portions a warp is divided into for shared memory bank conflict check 
-gpgpu_warpdistro_shader                   -1 # Specify which shader core to collect the warp size distribution from
-gpgpu_warp_issue_shader                    0 # Specify which shader core to collect the warp issue distribution from
-gpgpu_local_mem_map                    1 # Mapping from local memory space address to simulated GPU physical address space (default = enabled)
-gpgpu_num_reg_banks                   16 # Number of register banks (default = 8)
-gpgpu_reg_bank_use_warp_id                    0 # Use warp ID in mapping registers to banks (default = off)
-gpgpu_operand_collector_num_units_sp                    6 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_sfu                    8 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_mem                    2 # number of collector units (default = 2)
-gpgpu_operand_collector_num_units_gen                    0 # number of collector units (default = 0)
-gpgpu_operand_collector_num_in_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_operand_collector_num_out_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_coalesce_arch                   13 # Coalescing arch (default = 13, anything else is off for now)
-gpgpu_num_sched_per_core                    2 # Number of warp schedulers per core
-gpgpu_max_insn_issue_per_warp                    1 # Max number of instructions that can be issued per warp in one cycle by scheduler
-gpgpu_simt_core_sim_order                    1 # Select the simulation order of cores in a cluster (0=Fix, 1=Round-Robin)
-gpgpu_pipeline_widths        2,1,1,2,1,1,2 # Pipeline widths ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
-gpgpu_num_sp_units                     2 # Number of SP units (default=1)
-gpgpu_num_sfu_units                    1 # Number of SF units (default=1)
-gpgpu_num_mem_units                    1 # Number if ldst units (default=1) WARNING: not hooked up to anything
-gpgpu_scheduler                      gto # Scheduler configuration: < lrr | gto | two_level_active > If two_level_active:<num_active_warps>:<inner_prioritization>:<outer_prioritization>For complete list of prioritization values see shader.h enum scheduler_prioritization_typeDefault: gto
-gpgpu_dram_scheduler                    1 # 0 = fifo, 1 = FR-FCFS (defaul)
-gpgpu_dram_partition_queues              8:8:8:8 # i2$:$2d:d2$:$2i
-l2_ideal                               0 # Use a ideal L2 cache that always hit
-gpgpu_cache:dl2     64:128:8,L:B:m:W:L,A:32:4,4:0,32 # unified banked L2 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>}
-gpgpu_cache:dl2_texture_only                    0 # L2 cache used for texture only
-gpgpu_n_mem                            6 # number of memory modules (e.g. memory controllers) in gpu
-gpgpu_n_sub_partition_per_mchannel                    2 # number of memory subpartition in each memory module
-gpgpu_n_mem_per_ctrlr                    2 # number of memory chips per memory controller
-gpgpu_memlatency_stat                   14 # track and display latency statistics 0x2 enables MC, 0x4 enables queue logs
-gpgpu_frfcfs_dram_sched_queue_size                   16 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_return_queue_size                  116 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_buswidth                    4 # default = 4 bytes (8 bytes per cycle at DDR)
-gpgpu_dram_burst_length                    8 # Burst length of each DRAM request (default = 4 data bus cycle)
-dram_data_command_freq_ratio                    4 # Frequency ratio between DRAM data bus and command bus (default = 2 times, i.e. DDR)
-gpgpu_dram_timing_opt nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2 # DRAM timing parameters = {nbk:tCCD:tRRD:tRCD:tRAS:tRP:tRC:CL:WL:tCDLR:tWR:nbkgrp:tCCDL:tRTPL}
-rop_latency                          120 # ROP queue latency (default 85)
-dram_latency                         100 # DRAM latency (default 30)
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS # mapping memory address to dram model {dramid@<start bit>;<memory address map>}
-gpgpu_mem_addr_test                    0 # run sweep test to check address mapping for aliased address
-gpgpu_mem_address_mask                    1 # 0 = old addressing mask, 1 = new addressing mask, 2 = new add. mask + flipped bank sel and chip sel bits
-gpuwattch_xml_file  gpuwattch_gtx480.xml # GPUWattch XML file
-power_simulation_enabled                    1 # Turn on power simulator (1=On, 0=Off)
-power_per_cycle_dump                    0 # Dump detailed power output each cycle
-power_trace_enabled                    0 # produce a file for the power trace (1=On, 0=Off)
-power_trace_zlevel                     6 # Compression level of the power trace output log (0=no comp, 9=highest)
-steady_power_levels_enabled                    0 # produce a file for the steady power levels (1=On, 0=Off)
-steady_state_definition                  8:4 # allowed deviation:number of samples
-gpgpu_max_cycle                        0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_insn                         0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_cta                          0 # terminates gpu simulation early (0 = no limit)
-gpgpu_runtime_stat                   500 # display runtime statistics such as dram utilization {<freq>:<flag>}
-liveness_message_freq                    1 # Minimum number of seconds between simulation liveness messages (0 = always print)
-gpgpu_flush_l1_cache                    0 # Flush L1 cache at the end of each kernel call
-gpgpu_flush_l2_cache                    0 # Flush L2 cache at the end of each kernel call
-gpgpu_deadlock_detect                    1 # Stop the simulation at deadlock (1=on (default), 0=off)
-gpgpu_ptx_instruction_classification                    0 # if enabled will classify ptx instruction types per kernel (Max 255 kernels now)
-gpgpu_ptx_sim_mode                     0 # Select between Performance (default) or Functional simulation (1)
-gpgpu_clock_domains 700.0:700.0:700.0:924.0 # Clock Domain Frequencies in MhZ {<Core Clock>:<ICNT Clock>:<L2 Clock>:<DRAM Clock>}
-gpgpu_max_concurrent_kernel                    8 # maximum kernels that can run concurrently on GPU
-gpgpu_cflog_interval                    0 # Interval between each snapshot in control flow logger
-visualizer_enabled                     0 # Turn on visualizer output (1=On, 0=Off)
-visualizer_outputfile                 NULL # Specifies the output log file for visualizer
-visualizer_zlevel                      6 # Compression level of the visualizer output log (0=no comp, 9=highest)
-trace_enabled                          0 # Turn on traces
-trace_components                    none # comma seperated list of traces to enable. Complete list found in trace_streams.tup. Default none
-trace_sampling_core                    0 # The core which is printed using CORE_DPRINTF. Default 0
-trace_sampling_memory_partition                   -1 # The memory partition which is printed using MEMPART_DPRINTF. Default -1 (i.e. all)
-enable_ptx_file_line_stats                    1 # Turn on PTX source line statistic profiling. (1 = On)
-ptx_line_stats_filename gpgpu_inst_stats.txt # Output file for PTX source line statistics.
-save_embedded_ptx                      0 # saves ptx files embedded in binary as <n>.ptx
-keep                                   0 # keep intermediate files created by GPGPU-Sim when interfacing with external programs
-gpgpu_ptx_save_converted_ptxplus                    0 # Saved converted ptxplus to a file
-ptx_opcode_latency_int         4,13,4,5,145 # Opcode latencies for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,19,25,145
-ptx_opcode_latency_fp          4,13,4,5,39 # Opcode latencies for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,30
-ptx_opcode_latency_dp         8,19,8,8,330 # Opcode latencies for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,335
-ptx_opcode_initiation_int            1,2,2,1,8 # Opcode initiation intervals for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,4,4,32
-ptx_opcode_initiation_fp            1,2,1,1,4 # Opcode initiation intervals for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,5
-ptx_opcode_initiation_dp         8,16,8,8,130 # Opcode initiation intervals for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,130
DRAM Timing Options:
nbk                                    16 # number of banks
CCD                                     2 # column to column delay
RRD                                     6 # minimal delay between activation of rows in different banks
RCD                                    12 # row to column delay
RAS                                    28 # time needed to activate row
RP                                     12 # time needed to precharge (deactivate) row
RC                                     40 # row cycle time
CDLR                                    5 # switching from write to read (changes tWTR)
WR                                     12 # last data-in to row precharge
CL                                     12 # CAS latency
WL                                      4 # Write latency
nbkgrp                                  4 # number of bank groups
CCDL                                    3 # column to column delay between accesses to different bank groups
RTPL                                    2 # read to precharge delay between accesses to different bank groups
Total number of memory sub partition = 12
addr_dec_mask[CHIP]  = 0000000000000000 	high:64 low:0
addr_dec_mask[BK]    = 000000000000e100 	high:16 low:8
addr_dec_mask[ROW]   = 000000000fff0000 	high:28 low:16
addr_dec_mask[COL]   = 0000000000001eff 	high:13 low:0
addr_dec_mask[BURST] = 000000000000003f 	high:6 low:0
sub_partition_id_mask = 0000000000000100
GPGPU-Sim uArch: clock freqs: 700000000.000000:700000000.000000:700000000.000000:924000000.000000
GPGPU-Sim uArch: clock periods: 0.00000000142857142857:0.00000000142857142857:0.00000000142857142857:0.00000000108225108225
*** Initializing Memory Statistics ***
GPGPU-Sim uArch: interconnect node map (shaderID+MemID to icntID)
GPGPU-Sim uArch: Memory nodes ID start from index: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
GPGPU-Sim uArch: interconnect node reverse map (icntID to shaderID+MemID)
GPGPU-Sim uArch: Memory nodes start from ID: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
83a4e518f69376f7e08643a3a9e17862  /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
GPGPU-Sim uArch: performance model initialization complete.
GPGPU-Sim PTX: __cudaRegisterFatBinary, fat_cubin_handle = 1, filename=mm.cu
self exe links to: /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
Running md5sum using "md5sum /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM "
Running cuobjdump using "$CUDA_INSTALL_PATH/bin/cuobjdump -ptx -elf -sass /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM > _cuobjdump_complete_output_8HmEWb"
Parsing file _cuobjdump_complete_output_8HmEWb
######### cuobjdump parser ########
## Adding new section ELF
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_1.ptx
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section ELF
Adding arch: sm_20
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_2.ptx
Adding arch: sm_20
Adding identifier: mm.cu
Done parsing!!!
GPGPU-Sim PTX: __cudaRegisterFunction _Z14matrix_mul_gpuPiS_S_i : hostFun 0x0x400ce0, fat_cubin_handle = 1
GPGPU-Sim PTX: instruction assembly for function '_Z14matrix_mul_gpuPiS_S_i'...   done.
GPGPU-Sim PTX: finding reconvergence points for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: reconvergence points for _Z14matrix_mul_gpuPiS_S_i...
GPGPU-Sim PTX:  1 (potential) branch divergence @  PC=0x048 (_1.ptx:71) @%p1 bra $Lt_0_2306;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX:  2 (potential) branch divergence @  PC=0x130 (_1.ptx:103) @%p2 bra $Lt_0_1794;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:  3 (potential) branch divergence @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX: ... end of reconvergence points for _Z14matrix_mul_gpuPiS_S_i
GPGPU-Sim PTX: ... done pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'.
GPGPU-Sim PTX: finished parsing EMBEDDED .ptx file _1.ptx
Adding _cuobjdump_2.ptx with cubin handle 1
GPGPU-Sim PTX: extracting embedded .ptx to temporary file "_ptx_12vsTd"
Running: cat _ptx_12vsTd | sed 's/.version 1.5/.version 1.4/' | sed 's/, texmode_independent//' | sed 's/\(\.extern \.const\[1\] .b8 \w\+\)\[\]/\1\[1\]/' | sed 's/const\[.\]/const\[0\]/g' > _ptx2_4PvhQf
GPGPU-Sim PTX: generating ptxinfo using "$CUDA_INSTALL_PATH/bin/ptxas --gpu-name=sm_20 -v _ptx2_4PvhQf --output-file  /dev/null 2> _ptx_12vsTdinfo"
GPGPU-Sim PTX: Kernel '_Z14matrix_mul_gpuPiS_S_i' : regs=14, lmem=0, smem=0, cmem=60
GPGPU-Sim PTX: removing ptxinfo using "rm -f _ptx_12vsTd _ptx2_4PvhQf _ptx_12vsTdinfo"
GPGPU-Sim PTX: loading globals with explicit initializers... 
GPGPU-Sim PTX: finished loading globals (0 bytes total).
GPGPU-Sim PTX: loading constants with explicit initializers...  done.
Block(10,10)   Grid(15,8).

GPGPU-Sim PTX: cudaLaunch for 0x0x400ce0 (mode=performance simulation) on stream 0
GPGPU-Sim PTX: pushing kernel '_Z14matrix_mul_gpuPiS_S_i' to stream 0, gridDim= (15,8,1) blockDim = (10,10,1) 
kernel '_Z14matrix_mul_gpuPiS_S_i' transfer to GPU hardware scheduler
GPGPU-Sim uArch: Shader 4 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: CTA/core = 8, limited by: cta_limit
GPGPU-Sim uArch: core:  4, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 8 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  8, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 12 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 12, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 16 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 16, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 20 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 20, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 24 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 24, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 28 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 28, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 32 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 32, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 36 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 36, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 40 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 40, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 44 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 44, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 48 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 48, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 52 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 52, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 56 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 56, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 0 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  0, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 5 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  5, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 9 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  9, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 13 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 13, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 17 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 17, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 21 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 21, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 25 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 25, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 29 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 29, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 33 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 33, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 37 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 37, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 41 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 41, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 45 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 45, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 49 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 49, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 53 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 53, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 57 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 57, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 1 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  1, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 6 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  6, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 10 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 10, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 14 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 14, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 18 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 18, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 22 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 22, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 26 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 26, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 30 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 30, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 34 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 34, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 38 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 38, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 42 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 42, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 46 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 46, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 50 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 50, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 54 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 54, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 58 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 58, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 2 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  2, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 7 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  7, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 11 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 11, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 15 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 15, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 19 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 19, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 23 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 23, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 27 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 27, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 31 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 31, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 35 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 35, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 39 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 39, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 43 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 43, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 47 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 47, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 51 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 51, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 55 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 55, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 59 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 59, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 3 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  3, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: core:  4, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  8, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 12, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 16, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 20, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 24, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 28, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 32, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 36, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 40, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 44, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 48, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 52, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 56, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  0, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  5, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  9, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 13, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 17, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 21, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 25, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 29, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 33, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 37, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 41, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 45, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 49, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 53, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 57, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  1, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  6, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 10, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 14, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 18, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 22, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 26, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 30, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 34, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 38, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 42, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 46, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 50, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 54, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 58, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  2, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  7, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 11, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 15, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 19, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 23, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 27, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 31, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 35, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 39, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 43, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 47, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 51, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 55, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 59, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core:  3, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: cycles simulated: 500  inst.: 49456 (ipc=98.9) sim_rate=49456 (inst/sec) elapsed = 0:0:00:01 / Mon Jun 14 15:48:57 2021
GPGPU-Sim PTX: 100000 instructions simulated : ctaid=(1,0,0) tid=(1,5,0)
GPGPU-Sim PTX: 200000 instructions simulated : ctaid=(0,4,0) tid=(1,1,0)
GPGPU-Sim PTX: 300000 instructions simulated : ctaid=(8,2,0) tid=(5,3,0)
GPGPU-Sim PTX: 400000 instructions simulated : ctaid=(2,2,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 2500  inst.: 473544 (ipc=189.4) sim_rate=236772 (inst/sec) elapsed = 0:0:00:02 / Mon Jun 14 15:48:59 2021
GPGPU-Sim PTX: 500000 instructions simulated : ctaid=(12,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 600000 instructions simulated : ctaid=(3,6,0) tid=(7,4,0)
GPGPU-Sim PTX: 700000 instructions simulated : ctaid=(8,1,0) tid=(3,8,0)
GPGPU-Sim PTX: 800000 instructions simulated : ctaid=(3,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 900000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 3500  inst.: 874228 (ipc=249.8) sim_rate=291409 (inst/sec) elapsed = 0:0:00:03 / Mon Jun 14 15:49:00 2021
GPGPU-Sim PTX: 1000000 instructions simulated : ctaid=(8,6,0) tid=(7,8,0)
GPGPU-Sim PTX: 1100000 instructions simulated : ctaid=(1,3,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 4000  inst.: 1092420 (ipc=273.1) sim_rate=273105 (inst/sec) elapsed = 0:0:00:04 / Mon Jun 14 15:49:01 2021
GPGPU-Sim PTX: 1200000 instructions simulated : ctaid=(5,0,0) tid=(9,1,0)
GPGPU-Sim PTX: 1300000 instructions simulated : ctaid=(11,0,0) tid=(9,9,0)
GPGPU-Sim PTX: 1400000 instructions simulated : ctaid=(9,0,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 5000  inst.: 1472608 (ipc=294.5) sim_rate=294521 (inst/sec) elapsed = 0:0:00:05 / Mon Jun 14 15:49:02 2021
GPGPU-Sim PTX: 1500000 instructions simulated : ctaid=(10,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 1600000 instructions simulated : ctaid=(4,5,0) tid=(7,2,0)
GPGPU-Sim PTX: 1700000 instructions simulated : ctaid=(10,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 1800000 instructions simulated : ctaid=(0,3,0) tid=(5,7,0)
GPGPU-Sim uArch: cycles simulated: 6000  inst.: 1825436 (ipc=304.2) sim_rate=304239 (inst/sec) elapsed = 0:0:00:06 / Mon Jun 14 15:49:03 2021
GPGPU-Sim PTX: 1900000 instructions simulated : ctaid=(1,1,0) tid=(7,6,0)
GPGPU-Sim PTX: 2000000 instructions simulated : ctaid=(14,4,0) tid=(9,1,0)
GPGPU-Sim PTX: 2100000 instructions simulated : ctaid=(2,5,0) tid=(1,7,0)
GPGPU-Sim PTX: 2200000 instructions simulated : ctaid=(5,2,0) tid=(9,7,0)
GPGPU-Sim uArch: cycles simulated: 7000  inst.: 2181760 (ipc=311.7) sim_rate=311680 (inst/sec) elapsed = 0:0:00:07 / Mon Jun 14 15:49:04 2021
GPGPU-Sim PTX: 2300000 instructions simulated : ctaid=(1,2,0) tid=(1,9,0)
GPGPU-Sim PTX: 2400000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 2500000 instructions simulated : ctaid=(14,2,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 8000  inst.: 2555524 (ipc=319.4) sim_rate=319440 (inst/sec) elapsed = 0:0:00:08 / Mon Jun 14 15:49:05 2021
GPGPU-Sim PTX: 2600000 instructions simulated : ctaid=(11,4,0) tid=(5,3,0)
GPGPU-Sim PTX: 2700000 instructions simulated : ctaid=(0,1,0) tid=(7,8,0)
GPGPU-Sim PTX: 2800000 instructions simulated : ctaid=(1,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 2900000 instructions simulated : ctaid=(6,6,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 9000  inst.: 2895792 (ipc=321.8) sim_rate=321754 (inst/sec) elapsed = 0:0:00:09 / Mon Jun 14 15:49:06 2021
GPGPU-Sim PTX: 3000000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim PTX: 3100000 instructions simulated : ctaid=(9,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 3200000 instructions simulated : ctaid=(5,1,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 10000  inst.: 3257748 (ipc=325.8) sim_rate=325774 (inst/sec) elapsed = 0:0:00:10 / Mon Jun 14 15:49:07 2021
GPGPU-Sim PTX: 3300000 instructions simulated : ctaid=(12,6,0) tid=(9,5,0)
GPGPU-Sim PTX: 3400000 instructions simulated : ctaid=(0,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 3500000 instructions simulated : ctaid=(14,7,0) tid=(3,8,0)
GPGPU-Sim PTX: 3600000 instructions simulated : ctaid=(8,0,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 11000  inst.: 3600988 (ipc=327.4) sim_rate=327362 (inst/sec) elapsed = 0:0:00:11 / Mon Jun 14 15:49:08 2021
GPGPU-Sim PTX: 3700000 instructions simulated : ctaid=(12,3,0) tid=(5,3,0)
GPGPU-Sim PTX: 3800000 instructions simulated : ctaid=(13,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 3900000 instructions simulated : ctaid=(4,7,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 12000  inst.: 3965588 (ipc=330.5) sim_rate=330465 (inst/sec) elapsed = 0:0:00:12 / Mon Jun 14 15:49:09 2021
GPGPU-Sim PTX: 4000000 instructions simulated : ctaid=(4,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 4100000 instructions simulated : ctaid=(5,6,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 12500  inst.: 4159952 (ipc=332.8) sim_rate=319996 (inst/sec) elapsed = 0:0:00:13 / Mon Jun 14 15:49:10 2021
GPGPU-Sim PTX: 4200000 instructions simulated : ctaid=(3,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 4300000 instructions simulated : ctaid=(5,2,0) tid=(5,7,0)
GPGPU-Sim PTX: 4400000 instructions simulated : ctaid=(1,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 4500000 instructions simulated : ctaid=(4,3,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 13500  inst.: 4478000 (ipc=331.7) sim_rate=319857 (inst/sec) elapsed = 0:0:00:14 / Mon Jun 14 15:49:11 2021
GPGPU-Sim PTX: 4600000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 4700000 instructions simulated : ctaid=(12,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 4800000 instructions simulated : ctaid=(12,7,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 14500  inst.: 4845224 (ipc=334.2) sim_rate=323014 (inst/sec) elapsed = 0:0:00:15 / Mon Jun 14 15:49:12 2021
GPGPU-Sim PTX: 4900000 instructions simulated : ctaid=(14,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 5000000 instructions simulated : ctaid=(14,2,0) tid=(9,1,0)
GPGPU-Sim PTX: 5100000 instructions simulated : ctaid=(7,3,0) tid=(3,8,0)
GPGPU-Sim PTX: 5200000 instructions simulated : ctaid=(14,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 15500  inst.: 5198688 (ipc=335.4) sim_rate=324918 (inst/sec) elapsed = 0:0:00:16 / Mon Jun 14 15:49:13 2021
GPGPU-Sim PTX: 5300000 instructions simulated : ctaid=(9,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 5400000 instructions simulated : ctaid=(3,5,0) tid=(7,6,0)
GPGPU-Sim PTX: 5500000 instructions simulated : ctaid=(4,3,0) tid=(5,3,0)
GPGPU-Sim uArch: cycles simulated: 16500  inst.: 5561548 (ipc=337.1) sim_rate=327149 (inst/sec) elapsed = 0:0:00:17 / Mon Jun 14 15:49:14 2021
GPGPU-Sim PTX: 5600000 instructions simulated : ctaid=(5,1,0) tid=(9,9,0)
GPGPU-Sim PTX: 5700000 instructions simulated : ctaid=(8,0,0) tid=(3,4,0)
GPGPU-Sim PTX: 5800000 instructions simulated : ctaid=(3,1,0) tid=(1,3,0)
GPGPU-Sim PTX: 5900000 instructions simulated : ctaid=(0,7,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 17500  inst.: 5922888 (ipc=338.5) sim_rate=329049 (inst/sec) elapsed = 0:0:00:18 / Mon Jun 14 15:49:15 2021
GPGPU-Sim PTX: 6000000 instructions simulated : ctaid=(3,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 6100000 instructions simulated : ctaid=(2,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 6200000 instructions simulated : ctaid=(2,0,0) tid=(9,3,0)
GPGPU-Sim PTX: 6300000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 18500  inst.: 6276544 (ipc=339.3) sim_rate=330344 (inst/sec) elapsed = 0:0:00:19 / Mon Jun 14 15:49:16 2021
GPGPU-Sim PTX: 6400000 instructions simulated : ctaid=(10,0,0) tid=(1,7,0)
GPGPU-Sim PTX: 6500000 instructions simulated : ctaid=(13,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 6600000 instructions simulated : ctaid=(8,0,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 19500  inst.: 6644036 (ipc=340.7) sim_rate=332201 (inst/sec) elapsed = 0:0:00:20 / Mon Jun 14 15:49:17 2021
GPGPU-Sim PTX: 6700000 instructions simulated : ctaid=(8,6,0) tid=(5,1,0)
GPGPU-Sim PTX: 6800000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim PTX: 6900000 instructions simulated : ctaid=(5,6,0) tid=(9,9,0)
GPGPU-Sim PTX: 7000000 instructions simulated : ctaid=(4,5,0) tid=(5,9,0)
GPGPU-Sim uArch: cycles simulated: 20500  inst.: 7001524 (ipc=341.5) sim_rate=333405 (inst/sec) elapsed = 0:0:00:21 / Mon Jun 14 15:49:18 2021
GPGPU-Sim PTX: 7100000 instructions simulated : ctaid=(11,0,0) tid=(3,2,0)
GPGPU-Sim PTX: 7200000 instructions simulated : ctaid=(12,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 7300000 instructions simulated : ctaid=(12,0,0) tid=(5,1,0)
GPGPU-Sim uArch: cycles simulated: 21500  inst.: 7362756 (ipc=342.5) sim_rate=334670 (inst/sec) elapsed = 0:0:00:22 / Mon Jun 14 15:49:19 2021
GPGPU-Sim PTX: 7400000 instructions simulated : ctaid=(1,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 7500000 instructions simulated : ctaid=(11,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 7600000 instructions simulated : ctaid=(4,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 7700000 instructions simulated : ctaid=(11,7,0) tid=(5,9,0)
GPGPU-Sim uArch: cycles simulated: 22500  inst.: 7729204 (ipc=343.5) sim_rate=336052 (inst/sec) elapsed = 0:0:00:23 / Mon Jun 14 15:49:20 2021
GPGPU-Sim PTX: 7800000 instructions simulated : ctaid=(11,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 7900000 instructions simulated : ctaid=(7,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 8000000 instructions simulated : ctaid=(8,7,0) tid=(1,5,0)
GPGPU-Sim PTX: 8100000 instructions simulated : ctaid=(4,0,0) tid=(3,6,0)
GPGPU-Sim uArch: cycles simulated: 23500  inst.: 8076140 (ipc=343.7) sim_rate=336505 (inst/sec) elapsed = 0:0:00:24 / Mon Jun 14 15:49:21 2021
GPGPU-Sim PTX: 8200000 instructions simulated : ctaid=(6,3,0) tid=(7,0,0)
GPGPU-Sim PTX: 8300000 instructions simulated : ctaid=(12,7,0) tid=(3,6,0)
GPGPU-Sim PTX: 8400000 instructions simulated : ctaid=(5,5,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 24500  inst.: 8433512 (ipc=344.2) sim_rate=337340 (inst/sec) elapsed = 0:0:00:25 / Mon Jun 14 15:49:22 2021
GPGPU-Sim PTX: 8500000 instructions simulated : ctaid=(1,2,0) tid=(1,3,0)
GPGPU-Sim PTX: 8600000 instructions simulated : ctaid=(13,7,0) tid=(3,8,0)
GPGPU-Sim PTX: 8700000 instructions simulated : ctaid=(3,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 8800000 instructions simulated : ctaid=(11,4,0) tid=(9,5,0)
GPGPU-Sim uArch: cycles simulated: 25500  inst.: 8795428 (ipc=344.9) sim_rate=338285 (inst/sec) elapsed = 0:0:00:26 / Mon Jun 14 15:49:23 2021
GPGPU-Sim PTX: 8900000 instructions simulated : ctaid=(10,7,0) tid=(1,5,0)
GPGPU-Sim PTX: 9000000 instructions simulated : ctaid=(9,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 9100000 instructions simulated : ctaid=(10,1,0) tid=(9,3,0)
GPGPU-Sim uArch: cycles simulated: 26500  inst.: 9158844 (ipc=345.6) sim_rate=339216 (inst/sec) elapsed = 0:0:00:27 / Mon Jun 14 15:49:24 2021
GPGPU-Sim PTX: 9200000 instructions simulated : ctaid=(4,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 9300000 instructions simulated : ctaid=(8,4,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 27000  inst.: 9350236 (ipc=346.3) sim_rate=333937 (inst/sec) elapsed = 0:0:00:28 / Mon Jun 14 15:49:25 2021
GPGPU-Sim PTX: 9400000 instructions simulated : ctaid=(6,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 9500000 instructions simulated : ctaid=(8,1,0) tid=(5,7,0)
GPGPU-Sim PTX: 9600000 instructions simulated : ctaid=(5,1,0) tid=(9,7,0)
GPGPU-Sim PTX: 9700000 instructions simulated : ctaid=(7,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 28000  inst.: 9687816 (ipc=346.0) sim_rate=334062 (inst/sec) elapsed = 0:0:00:29 / Mon Jun 14 15:49:26 2021
GPGPU-Sim PTX: 9800000 instructions simulated : ctaid=(1,2,0) tid=(1,1,0)
GPGPU-Sim PTX: 9900000 instructions simulated : ctaid=(1,7,0) tid=(7,8,0)
GPGPU-Sim PTX: 10000000 instructions simulated : ctaid=(1,2,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 29000  inst.: 10051864 (ipc=346.6) sim_rate=335062 (inst/sec) elapsed = 0:0:00:30 / Mon Jun 14 15:49:27 2021
GPGPU-Sim PTX: 10100000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 10200000 instructions simulated : ctaid=(3,6,0) tid=(9,1,0)
GPGPU-Sim PTX: 10300000 instructions simulated : ctaid=(2,1,0) tid=(3,4,0)
GPGPU-Sim PTX: 10400000 instructions simulated : ctaid=(11,2,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 30000  inst.: 10408808 (ipc=347.0) sim_rate=335768 (inst/sec) elapsed = 0:0:00:31 / Mon Jun 14 15:49:28 2021
GPGPU-Sim PTX: 10500000 instructions simulated : ctaid=(6,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 10600000 instructions simulated : ctaid=(12,4,0) tid=(3,0,0)
GPGPU-Sim PTX: 10700000 instructions simulated : ctaid=(0,4,0) tid=(9,7,0)
GPGPU-Sim uArch: cycles simulated: 31000  inst.: 10769580 (ipc=347.4) sim_rate=336549 (inst/sec) elapsed = 0:0:00:32 / Mon Jun 14 15:49:29 2021
GPGPU-Sim PTX: 10800000 instructions simulated : ctaid=(14,4,0) tid=(7,6,0)
GPGPU-Sim PTX: 10900000 instructions simulated : ctaid=(7,6,0) tid=(9,1,0)
GPGPU-Sim PTX: 11000000 instructions simulated : ctaid=(6,3,0) tid=(5,5,0)
GPGPU-Sim PTX: 11100000 instructions simulated : ctaid=(13,2,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 32000  inst.: 11110308 (ipc=347.2) sim_rate=336676 (inst/sec) elapsed = 0:0:00:33 / Mon Jun 14 15:49:30 2021
GPGPU-Sim PTX: 11200000 instructions simulated : ctaid=(6,4,0) tid=(7,0,0)
GPGPU-Sim PTX: 11300000 instructions simulated : ctaid=(3,6,0) tid=(9,9,0)
GPGPU-Sim PTX: 11400000 instructions simulated : ctaid=(13,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 11500000 instructions simulated : ctaid=(4,5,0) tid=(5,5,0)
GPGPU-Sim uArch: cycles simulated: 33000  inst.: 11488720 (ipc=348.1) sim_rate=337903 (inst/sec) elapsed = 0:0:00:34 / Mon Jun 14 15:49:31 2021
GPGPU-Sim PTX: 11600000 instructions simulated : ctaid=(12,2,0) tid=(1,1,0)
GPGPU-Sim PTX: 11700000 instructions simulated : ctaid=(12,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 11800000 instructions simulated : ctaid=(7,1,0) tid=(7,0,0)
GPGPU-Sim uArch: cycles simulated: 34000  inst.: 11823760 (ipc=347.8) sim_rate=337821 (inst/sec) elapsed = 0:0:00:35 / Mon Jun 14 15:49:32 2021
GPGPU-Sim PTX: 11900000 instructions simulated : ctaid=(2,2,0) tid=(5,5,0)
GPGPU-Sim PTX: 12000000 instructions simulated : ctaid=(8,6,0) tid=(9,1,0)
GPGPU-Sim PTX: 12100000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 12200000 instructions simulated : ctaid=(11,4,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 35000  inst.: 12216676 (ipc=349.0) sim_rate=339352 (inst/sec) elapsed = 0:0:00:36 / Mon Jun 14 15:49:33 2021
GPGPU-Sim PTX: 12300000 instructions simulated : ctaid=(2,5,0) tid=(1,5,0)
GPGPU-Sim PTX: 12400000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 12500000 instructions simulated : ctaid=(2,5,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 36000  inst.: 12558896 (ipc=348.9) sim_rate=339429 (inst/sec) elapsed = 0:0:00:37 / Mon Jun 14 15:49:34 2021
GPGPU-Sim PTX: 12600000 instructions simulated : ctaid=(0,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 12700000 instructions simulated : ctaid=(6,3,0) tid=(7,2,0)
GPGPU-Sim PTX: 12800000 instructions simulated : ctaid=(5,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 12900000 instructions simulated : ctaid=(12,3,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 37000  inst.: 12910304 (ipc=348.9) sim_rate=339744 (inst/sec) elapsed = 0:0:00:38 / Mon Jun 14 15:49:35 2021
GPGPU-Sim PTX: 13000000 instructions simulated : ctaid=(14,2,0) tid=(1,5,0)
GPGPU-Sim PTX: 13100000 instructions simulated : ctaid=(11,5,0) tid=(1,3,0)
GPGPU-Sim PTX: 13200000 instructions simulated : ctaid=(8,4,0) tid=(3,8,0)
GPGPU-Sim uArch: cycles simulated: 38000  inst.: 13259572 (ipc=348.9) sim_rate=339989 (inst/sec) elapsed = 0:0:00:39 / Mon Jun 14 15:49:36 2021
GPGPU-Sim PTX: 13300000 instructions simulated : ctaid=(1,6,0) tid=(7,0,0)
GPGPU-Sim PTX: 13400000 instructions simulated : ctaid=(1,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 13500000 instructions simulated : ctaid=(3,0,0) tid=(1,3,0)
GPGPU-Sim PTX: 13600000 instructions simulated : ctaid=(7,5,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 39000  inst.: 13629308 (ipc=349.5) sim_rate=340732 (inst/sec) elapsed = 0:0:00:40 / Mon Jun 14 15:49:37 2021
GPGPU-Sim PTX: 13700000 instructions simulated : ctaid=(3,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 13800000 instructions simulated : ctaid=(2,1,0) tid=(1,3,0)
GPGPU-Sim PTX: 13900000 instructions simulated : ctaid=(1,2,0) tid=(3,6,0)
GPGPU-Sim PTX: 14000000 instructions simulated : ctaid=(1,4,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 40000  inst.: 13984824 (ipc=349.6) sim_rate=341093 (inst/sec) elapsed = 0:0:00:41 / Mon Jun 14 15:49:38 2021
GPGPU-Sim PTX: 14100000 instructions simulated : ctaid=(0,2,0) tid=(3,0,0)
GPGPU-Sim PTX: 14200000 instructions simulated : ctaid=(9,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 14300000 instructions simulated : ctaid=(4,6,0) tid=(3,8,0)
GPGPU-Sim uArch: cycles simulated: 41000  inst.: 14344484 (ipc=349.9) sim_rate=341535 (inst/sec) elapsed = 0:0:00:42 / Mon Jun 14 15:49:39 2021
GPGPU-Sim PTX: 14400000 instructions simulated : ctaid=(4,5,0) tid=(9,7,0)
GPGPU-Sim PTX: 14500000 instructions simulated : ctaid=(8,6,0) tid=(7,6,0)
GPGPU-Sim PTX: 14600000 instructions simulated : ctaid=(4,3,0) tid=(3,2,0)
GPGPU-Sim PTX: 14700000 instructions simulated : ctaid=(10,5,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 42000  inst.: 14687016 (ipc=349.7) sim_rate=341558 (inst/sec) elapsed = 0:0:00:43 / Mon Jun 14 15:49:40 2021
GPGPU-Sim PTX: 14800000 instructions simulated : ctaid=(11,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 14900000 instructions simulated : ctaid=(7,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 15000000 instructions simulated : ctaid=(14,6,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 43000  inst.: 15056520 (ipc=350.2) sim_rate=342193 (inst/sec) elapsed = 0:0:00:44 / Mon Jun 14 15:49:41 2021
GPGPU-Sim PTX: 15100000 instructions simulated : ctaid=(9,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 15200000 instructions simulated : ctaid=(3,2,0) tid=(5,5,0)
GPGPU-Sim PTX: 15300000 instructions simulated : ctaid=(14,1,0) tid=(5,5,0)
GPGPU-Sim PTX: 15400000 instructions simulated : ctaid=(0,2,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 44000  inst.: 15410888 (ipc=350.2) sim_rate=342464 (inst/sec) elapsed = 0:0:00:45 / Mon Jun 14 15:49:42 2021
GPGPU-Sim PTX: 15500000 instructions simulated : ctaid=(0,0,0) tid=(7,8,0)
GPGPU-Sim PTX: 15600000 instructions simulated : ctaid=(1,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 15700000 instructions simulated : ctaid=(6,3,0) tid=(3,6,0)
GPGPU-Sim uArch: cycles simulated: 45000  inst.: 15772716 (ipc=350.5) sim_rate=342885 (inst/sec) elapsed = 0:0:00:46 / Mon Jun 14 15:49:43 2021
GPGPU-Sim PTX: 15800000 instructions simulated : ctaid=(4,7,0) tid=(3,4,0)
GPGPU-Sim PTX: 15900000 instructions simulated : ctaid=(0,2,0) tid=(9,7,0)
GPGPU-Sim PTX: 16000000 instructions simulated : ctaid=(9,5,0) tid=(9,1,0)
GPGPU-Sim PTX: 16100000 instructions simulated : ctaid=(11,5,0) tid=(1,7,0)
GPGPU-Sim uArch: cycles simulated: 46000  inst.: 16147168 (ipc=351.0) sim_rate=343556 (inst/sec) elapsed = 0:0:00:47 / Mon Jun 14 15:49:44 2021
GPGPU-Sim PTX: 16200000 instructions simulated : ctaid=(6,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 16300000 instructions simulated : ctaid=(4,4,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 46500  inst.: 16323188 (ipc=351.0) sim_rate=340066 (inst/sec) elapsed = 0:0:00:48 / Mon Jun 14 15:49:45 2021
GPGPU-Sim PTX: 16400000 instructions simulated : ctaid=(5,1,0) tid=(3,2,0)
GPGPU-Sim PTX: 16500000 instructions simulated : ctaid=(4,6,0) tid=(9,5,0)
GPGPU-Sim PTX: 16600000 instructions simulated : ctaid=(10,2,0) tid=(5,1,0)
GPGPU-Sim uArch: Shader 16 finished CTA #0 (47265,0), 1 CTAs running
GPGPU-Sim uArch: Shader 29 finished CTA #0 (47414,0), 1 CTAs running
GPGPU-Sim uArch: Shader 5 finished CTA #0 (47423,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #0 (47436,0), 1 CTAs running
GPGPU-Sim uArch: Shader 51 finished CTA #0 (47497,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #0 (47499,0), 1 CTAs running
GPGPU-Sim uArch: cycles simulated: 47500  inst.: 16627484 (ipc=350.1) sim_rate=339336 (inst/sec) elapsed = 0:0:00:49 / Mon Jun 14 15:49:46 2021
GPGPU-Sim uArch: Shader 14 finished CTA #0 (47501,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #0 (47559,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #1 (47584,0), 0 CTAs running
GPGPU-Sim uArch: Shader 14 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 39 finished CTA #0 (47615,0), 1 CTAs running
GPGPU-Sim uArch: Shader 16 finished CTA #1 (47620,0), 0 CTAs running
GPGPU-Sim uArch: Shader 16 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #0 (47632,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #0 (47633,0), 1 CTAs running
GPGPU-Sim uArch: Shader 7 finished CTA #0 (47636,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #1 (47654,0), 0 CTAs running
GPGPU-Sim uArch: Shader 17 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 10 finished CTA #0 (47661,0), 1 CTAs running
GPGPU-Sim uArch: Shader 34 finished CTA #1 (47663,0), 1 CTAs running
GPGPU-Sim uArch: Shader 53 finished CTA #0 (47671,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #0 (47690,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #1 (47725,0), 0 CTAs running
GPGPU-Sim uArch: Shader 18 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 27 finished CTA #0 (47732,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #0 (47734,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #0 (47754,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #0 (47758,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #0 (47760,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #1 (47767,0), 0 CTAs running
GPGPU-Sim uArch: Shader 12 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 5 finished CTA #1 (47768,0), 0 CTAs running
GPGPU-Sim uArch: Shader 5 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 32 finished CTA #0 (47776,0), 1 CTAs running
GPGPU-Sim uArch: Shader 27 finished CTA #1 (47790,0), 0 CTAs running
GPGPU-Sim uArch: Shader 27 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #0 (47794,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #0 (47796,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #1 (47798,0), 0 CTAs running
GPGPU-Sim uArch: Shader 8 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #0 (47811,0), 1 CTAs running
GPGPU-Sim uArch: Shader 25 finished CTA #0 (47812,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #1 (47813,0), 0 CTAs running
GPGPU-Sim uArch: Shader 3 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 13 finished CTA #1 (47818,0), 1 CTAs running
GPGPU-Sim uArch: Shader 13 finished CTA #0 (47838,0), 0 CTAs running
GPGPU-Sim uArch: Shader 13 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 44 finished CTA #0 (47844,0), 1 CTAs running
GPGPU-Sim uArch: Shader 32 finished CTA #1 (47848,0), 0 CTAs running
GPGPU-Sim uArch: Shader 32 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 50 finished CTA #1 (47862,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #1 (47863,0), 0 CTAs running
GPGPU-Sim uArch: Shader 23 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 53 finished CTA #1 (47865,0), 0 CTAs running
GPGPU-Sim uArch: Shader 53 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 7 finished CTA #1 (47867,0), 0 CTAs running
GPGPU-Sim uArch: Shader 7 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #1 (47875,0), 1 CTAs running
GPGPU-Sim uArch: Shader 52 finished CTA #1 (47884,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 19 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 40 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 6 finished CTA #0 (47888,0), 1 CTAs running
GPGPU-Sim uArch: Shader 57 finished CTA #0 (47892,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #1 (47904,0), 0 CTAs running
GPGPU-Sim uArch: Shader 55 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 52 finished CTA #0 (47906,0), 0 CTAs running
GPGPU-Sim uArch: Shader 52 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #0 (47909,0), 0 CTAs running
GPGPU-Sim uArch: Shader 20 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 0 finished CTA #0 (47914,0), 1 CTAs running
GPGPU-Sim uArch: Shader 10 finished CTA #1 (47915,0), 0 CTAs running
GPGPU-Sim uArch: Shader 10 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 29 finished CTA #1 (47919,0), 0 CTAs running
GPGPU-Sim uArch: Shader 29 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 11 finished CTA #0 (47938,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #1 (47963,0), 0 CTAs running
GPGPU-Sim uArch: Shader 45 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #1 (47964,0), 1 CTAs running
GPGPU-Sim uArch: Shader 11 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 11 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 57 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 57 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #0 (47967,0), 1 CTAs running
GPGPU-Sim uArch: Shader 43 finished CTA #0 (47975,0), 1 CTAs running
GPGPU-Sim uArch: Shader 47 finished CTA #0 (47978,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #0 (47985,0), 1 CTAs running
GPGPU-Sim uArch: Shader 36 finished CTA #0 (47987,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #1 (47988,0), 0 CTAs running
GPGPU-Sim uArch: Shader 4 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #1 (47991,0), 0 CTAs running
GPGPU-Sim uArch: Shader 9 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #0 (47996,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #1 (48000,0), 0 CTAs running
GPGPU-Sim uArch: Shader 21 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 51 finished CTA #1 (48001,0), 0 CTAs running
GPGPU-Sim uArch: Shader 51 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #0 (48008,0), 0 CTAs running
GPGPU-Sim uArch: Shader 15 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #0 (48010,0), 1 CTAs running
GPGPU-Sim uArch: Shader 30 finished CTA #0 (48018,0), 1 CTAs running
GPGPU-Sim uArch: Shader 0 finished CTA #1 (48019,0), 0 CTAs running
GPGPU-Sim uArch: Shader 0 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 54 finished CTA #0 (48026,0), 1 CTAs running
GPGPU-Sim uArch: Shader 6 finished CTA #1 (48028,0), 0 CTAs running
GPGPU-Sim uArch: Shader 6 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #0 (48031,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #0 (48042,0), 1 CTAs running
GPGPU-Sim uArch: Shader 54 finished CTA #1 (48056,0), 0 CTAs running
GPGPU-Sim uArch: Shader 54 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #1 (48059,0), 0 CTAs running
GPGPU-Sim uArch: Shader 24 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #1 (48061,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #1 (48063,0), 0 CTAs running
GPGPU-Sim uArch: Shader 48 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 30 finished CTA #1 (48070,0), 0 CTAs running
GPGPU-Sim uArch: Shader 30 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 25 finished CTA #1 (48079,0), 0 CTAs running
GPGPU-Sim uArch: Shader 25 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #1 (48081,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #1 (48083,0), 0 CTAs running
GPGPU-Sim uArch: Shader 59 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #0 (48084,0), 1 CTAs running
GPGPU-Sim uArch: Shader 49 finished CTA #1 (48086,0), 1 CTAs running
GPGPU-Sim uArch: Shader 39 finished CTA #1 (48096,0), 0 CTAs running
GPGPU-Sim uArch: Shader 39 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #0 (48098,0), 0 CTAs running
GPGPU-Sim uArch: Shader 2 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 49 finished CTA #0 (48102,0), 0 CTAs running
GPGPU-Sim uArch: Shader 49 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 28 finished CTA #0 (48104,0), 1 CTAs running
GPGPU-Sim uArch: Shader 28 finished CTA #1 (48107,0), 0 CTAs running
GPGPU-Sim uArch: Shader 28 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #0 (48111,0), 1 CTAs running
GPGPU-Sim uArch: Shader 50 finished CTA #0 (48111,0), 0 CTAs running
GPGPU-Sim uArch: Shader 50 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #1 (48114,0), 0 CTAs running
GPGPU-Sim uArch: Shader 22 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #1 (48132,0), 0 CTAs running
GPGPU-Sim uArch: Shader 58 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 47 finished CTA #1 (48143,0), 0 CTAs running
GPGPU-Sim uArch: Shader 47 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #0 (48153,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #1 (48153,0), 0 CTAs running
GPGPU-Sim uArch: Shader 35 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 36 finished CTA #1 (48154,0), 0 CTAs running
GPGPU-Sim uArch: Shader 36 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #1 (48160,0), 0 CTAs running
GPGPU-Sim uArch: Shader 31 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 26 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 33 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #0 (48165,0), 0 CTAs running
GPGPU-Sim uArch: Shader 38 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 34 finished CTA #0 (48177,0), 0 CTAs running
GPGPU-Sim uArch: Shader 34 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #1 (48188,0), 0 CTAs running
GPGPU-Sim uArch: Shader 56 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #1 (48189,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #0 (48192,0), 1 CTAs running
GPGPU-Sim uArch: Shader 37 finished CTA #1 (48202,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #1 (48212,0), 0 CTAs running
GPGPU-Sim uArch: Shader 1 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #0 (48216,0), 0 CTAs running
GPGPU-Sim uArch: Shader 41 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 37 finished CTA #0 (48218,0), 0 CTAs running
GPGPU-Sim uArch: Shader 37 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #1 (48221,0), 0 CTAs running
GPGPU-Sim uArch: Shader 46 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #0 (48226,0), 1 CTAs running
GPGPU-Sim uArch: Shader 44 finished CTA #1 (48233,0), 0 CTAs running
GPGPU-Sim uArch: Shader 44 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #1 (48248,0), 0 CTAs running
GPGPU-Sim uArch: Shader 42 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 43 finished CTA #1 (48281,0), 0 CTAs running
GPGPU-Sim uArch: Shader 43 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: GPU detected kernel '_Z14matrix_mul_gpuPiS_S_i' finished on shader 43.
kernel_name = _Z14matrix_mul_gpuPiS_S_i 
kernel_launch_uid = 1 
gpu_sim_cycle = 48282
gpu_sim_insn = 16632000
gpu_ipc =     344.4762
gpu_tot_sim_cycle = 48282
gpu_tot_sim_insn = 16632000
gpu_tot_ipc =     344.4762
gpu_tot_issued_cta = 0
gpu_stall_dramfull = 42547
gpu_stall_icnt2sh    = 68778
gpu_total_sim_rate=339428

========= Core cache stats =========
L1I_cache:
	L1I_total_cache_accesses = 371520
	L1I_total_cache_misses = 1920
	L1I_total_cache_miss_rate = 0.0052
	L1I_total_cache_pending_hits = 0
	L1I_total_cache_reservation_fails = 0
L1D_cache:
	L1D_cache_core[0]: Access = 21513, Miss = 1284, Miss_rate = 0.060, Pending_hits = 5147, Reservation_fails = 2614
	L1D_cache_core[1]: Access = 21489, Miss = 1278, Miss_rate = 0.059, Pending_hits = 5140, Reservation_fails = 1796
	L1D_cache_core[2]: Access = 21492, Miss = 1281, Miss_rate = 0.060, Pending_hits = 5151, Reservation_fails = 2084
	L1D_cache_core[3]: Access = 21457, Miss = 1268, Miss_rate = 0.059, Pending_hits = 5125, Reservation_fails = 1525
	L1D_cache_core[4]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5144, Reservation_fails = 928
	L1D_cache_core[5]: Access = 21481, Miss = 1272, Miss_rate = 0.059, Pending_hits = 5142, Reservation_fails = 2045
	L1D_cache_core[6]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5143, Reservation_fails = 2898
	L1D_cache_core[7]: Access = 21505, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5158, Reservation_fails = 3324
	L1D_cache_core[8]: Access = 21508, Miss = 1279, Miss_rate = 0.059, Pending_hits = 5149, Reservation_fails = 2750
	L1D_cache_core[9]: Access = 21505, Miss = 1287, Miss_rate = 0.060, Pending_hits = 5157, Reservation_fails = 3313
	L1D_cache_core[10]: Access = 21508, Miss = 1293, Miss_rate = 0.060, Pending_hits = 5164, Reservation_fails = 3276
	L1D_cache_core[11]: Access = 21505, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5148, Reservation_fails = 2931
	L1D_cache_core[12]: Access = 21508, Miss = 1292, Miss_rate = 0.060, Pending_hits = 5150, Reservation_fails = 2987
	L1D_cache_core[13]: Access = 21513, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5145, Reservation_fails = 2983
	L1D_cache_core[14]: Access = 21516, Miss = 1285, Miss_rate = 0.060, Pending_hits = 5136, Reservation_fails = 3154
	L1D_total_cache_accesses = 322468
	L1D_total_cache_misses = 19228
	L1D_total_cache_miss_rate = 0.0596
	L1D_total_cache_pending_hits = 77199
	L1D_total_cache_reservation_fails = 38608
	L1D_cache_data_port_util = 0.078
	L1D_cache_fill_port_util = 0.006
L1C_cache:
	L1C_total_cache_accesses = 1920
	L1C_total_cache_misses = 480
	L1C_total_cache_miss_rate = 0.2500
	L1C_total_cache_pending_hits = 0
	L1C_total_cache_reservation_fails = 0
L1T_cache:
	L1T_total_cache_accesses = 0
	L1T_total_cache_misses = 0
	L1T_total_cache_pending_hits = 0
	L1T_total_cache_reservation_fails = 0

Total_core_cache_stats:
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 225461
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 77199
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 17972
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 13408
	Total_core_cache_stats_breakdown[CONST_ACC_R][HIT] = 1440
	Total_core_cache_stats_breakdown[CONST_ACC_R][MISS] = 480
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 580
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 1256
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][RESERVATION_FAIL] = 25200
	Total_core_cache_stats_breakdown[INST_ACC_R][HIT] = 369600
	Total_core_cache_stats_breakdown[INST_ACC_R][MISS] = 1920
Shader 0 warp_id issue ditsribution:
warp_id:
0, 1, 2, 3, 4, 5, 6, 7, 
distro:
1388, 1388, 1388, 1388, 1388, 1388, 1388, 1388, 
gpgpu_n_tot_thrd_icount = 21319680
gpgpu_n_tot_w_icount = 666240
gpgpu_n_stall_shd_mem = 216596
gpgpu_n_mem_read_local = 0
gpgpu_n_mem_write_local = 0
gpgpu_n_mem_read_global = 17972
gpgpu_n_mem_write_global = 1836
gpgpu_n_mem_texture = 0
gpgpu_n_mem_const = 60
gpgpu_n_load_insn  = 3600000
gpgpu_n_store_insn = 12000
gpgpu_n_shmem_insn = 0
gpgpu_n_tex_insn = 0
gpgpu_n_const_mem_insn = 0
gpgpu_n_param_mem_insn = 48000
gpgpu_n_shmem_bkconflict = 0
gpgpu_n_cache_bkconflict = 0
gpgpu_n_intrawarp_mshr_merge = 0
gpgpu_n_cmem_portconflict = 0
gpgpu_stall_shd_mem[c_mem][bk_conf] = 0
gpgpu_stall_shd_mem[c_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[c_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[c_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[t_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[t_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[t_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[s_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][coal_stall] = 216596
gpgpu_stall_shd_mem[gl_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[g_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[g_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpu_reg_bank_conflict_stalls = 0
Warp Occupancy Distribution:
Stall:169500	W0_Idle:293067	W0_Scoreboard:4645929	W1:0	W2:0	W3:0	W4:166560	W5:0	W6:0	W7:0	W8:0	W9:0	W10:0	W11:0	W12:0	W13:0	W14:0	W15:0	W16:0	W17:0	W18:0	W19:0	W20:0	W21:0	W22:0	W23:0	W24:0	W25:0	W26:0	W27:0	W28:0	W29:0	W30:0	W31:0	W32:499680
traffic_breakdown_coretomem[CONST_ACC_R] = 480 {8:60,}
traffic_breakdown_coretomem[GLOBAL_ACC_R] = 143776 {8:17972,}
traffic_breakdown_coretomem[GLOBAL_ACC_W] = 117600 {40:1008,72:552,136:276,}
traffic_breakdown_coretomem[INST_ACC_R] = 1920 {8:240,}
traffic_breakdown_memtocore[CONST_ACC_R] = 4320 {72:60,}
traffic_breakdown_memtocore[GLOBAL_ACC_R] = 2444192 {136:17972,}
traffic_breakdown_memtocore[GLOBAL_ACC_W] = 14688 {8:1836,}
traffic_breakdown_memtocore[INST_ACC_R] = 32640 {136:240,}
maxmrqlatency = 12 
maxdqlatency = 0 
maxmflatency = 1356 
averagemflatency = 264 
max_icnt2mem_latency = 1205 
max_icnt2sh_latency = 48281 
mrq_lat_table:1080 	32 	4 	10 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
dq_lat_table:0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_table:0 	0 	0 	0 	0 	0 	0 	10720 	8178 	899 	71 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2mem_lat_table:0 	0 	0 	15440 	432 	764 	1403 	1061 	742 	253 	13 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2sh_lat_table:0 	0 	0 	3446 	13655 	892 	39 	0 	0 	0 	0 	0 	0 	0 	0 	1836 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_pw_table:0 	0 	0 	0 	0 	0 	0 	87 	6 	2 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
maximum concurrent accesses to same row:
dram[0]:         1         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
maximum service time to same row:
dram[0]:       502         0         0         0      1650      1707      1653      1501     12085     12831     36822     37578         0         0         0         0 
dram[1]:      1485         0         0         0      1763      1494      1765      1521     12204     13041     36903     37765         0         0         0         0 
dram[2]:         0         0         0         0      1503      1510      1526      1516     12325     13141     37102     37849         0         0         0         0 
dram[3]:         0         0         0         0      1499      1588      1525      1678     12515     13263     37144     38024         0         0         0         0 
dram[4]:         0         0         0         0      1616      1493      1484      1515     12585     13452     37341     38075         0         0         0         0 
dram[5]:         0         0         0         0      1497      1606      1519      1688     12779     13531     37456     38221         0         0         0         0 
average row accesses per activate:
dram[0]:  1.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 21.000000 19.000000      -nan      -nan      -nan      -nan 
dram[1]:  2.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 19.000000 16.000000      -nan      -nan      -nan      -nan 
dram[2]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 18.000000      -nan      -nan      -nan      -nan 
dram[3]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 17.000000      -nan      -nan      -nan      -nan 
dram[4]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 18.000000 21.000000      -nan      -nan      -nan      -nan 
dram[5]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 22.000000 16.000000      -nan      -nan      -nan      -nan 
average row locality = 1126/52 = 21.653847
number of total memory accesses made:
dram[0]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
total accesses: 0
min_bank_accesses = 0!
min_chip_accesses = 0!
number of total read accesses:
dram[0]:         3         0         0         0        10        10        32        32        32        32        16        16         0         0         0         0 
dram[1]:         2         0         0         0        10        10        32        32        32        32        16        15         0         0         0         0 
dram[2]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[3]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[4]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[5]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
total reads: 1084
min_bank_accesses = 0!
chip skew: 183/180 = 1.02
number of total write accesses:
dram[0]:         0         0         0         0         0         0         0         0         0         0         5         3         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         3         1         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         3         4         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         3         3         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         2         7         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         6         2         0         0         0         0 
total reads: 42
min_bank_accesses = 0!
chip skew: 9/4 = 2.25
average mf latency per bank:
dram[0]:       8505    none      none      none        6219      4879      4705      5583      4863      4800      4214      4411    none      none      none      none  
dram[1]:          0    none      none      none        4030      5132      5447      4339      4812      4857      4258      5293    none      none      none      none  
dram[2]:     none      none      none      none        4973      3789      4482      4576      4719      4883      4095      3891    none      none      none      none  
dram[3]:     none      none      none      none        3574      4782      4611      4852      4883      5030      4219      3979    none      none      none      none  
dram[4]:     none      none      none      none        6249      4014      5355      4358      4534      4853      4952      3543    none      none      none      none  
dram[5]:     none      none      none      none        4108      5204      4843      5156      4601      4623      3826      4504    none      none      none      none  
maximum mf latency per bank:
dram[0]:        486         0         0         0      1110      1069      1107      1325       592       781       876       721         0         0         0         0
dram[1]:          0         0         0         0      1097       809      1356       876       801       646       697       706         0         0         0         0
dram[2]:          0         0         0         0       882       679       907      1071       716       771       873       759         0         0         0         0
dram[3]:          0         0         0         0       377       983      1018      1068       742       781       690       817         0         0         0         0
dram[4]:          0         0         0         0      1081       527      1180       723       799       631       718       700         0         0         0         0
dram[5]:          0         0         0         0       543      1153      1006      1168       728       775       873       742         0         0         0         0

Number of Memory Banks Accessed per Memory Operation per Warp (from 0):
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
Average # of Memory Banks Accessed per Memory Operation per Warp=-nan

position of mrq chosen
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	

average position of mrq chosen = -nan
Memory Partition 0: 
Cache L2_bank_000:
MSHR contents

Cache L2_bank_001:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[0]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63344 n_act=11 n_pre=2 n_req=191 n_rd=366 n_write=8 bw_util=0.01174
n_activity=2903 dram_eff=0.2577
bk0: 6a 63659i bk1: 0a 63730i bk2: 0a 63734i bk3: 0a 63734i bk4: 20a 63671i bk5: 20a 63680i bk6: 64a 63581i bk7: 64a 63588i bk8: 64a 63586i bk9: 64a 63580i bk10: 32a 63625i bk11: 32a 63618i bk12: 0a 63727i bk13: 0a 63727i bk14: 0a 63730i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000784548
Memory Partition 1: 
Cache L2_bank_002:
MSHR contents

Cache L2_bank_003:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[1]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=9 n_pre=0 n_req=185 n_rd=362 n_write=5 bw_util=0.01152
n_activity=2703 dram_eff=0.2716
bk0: 4a 63711i bk1: 0a 63732i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 20a 63676i bk6: 64a 63580i bk7: 64a 63580i bk8: 64a 63587i bk9: 64a 63581i bk10: 32a 63624i bk11: 30a 63641i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63732i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000266746
Memory Partition 2: 
Cache L2_bank_004:
MSHR contents

Cache L2_bank_005:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[2]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=187 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2765 dram_eff=0.2662
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63679i bk5: 24a 63671i bk6: 64a 63581i bk7: 64a 63590i bk8: 64a 63588i bk9: 64a 63587i bk10: 32a 63623i bk11: 28a 63638i bk12: 0a 63728i bk13: 0a 63730i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000235364
Memory Partition 3: 
Cache L2_bank_006:
MSHR contents

Cache L2_bank_007:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[3]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63357 n_act=8 n_pre=0 n_req=186 n_rd=360 n_write=6 bw_util=0.01149
n_activity=2830 dram_eff=0.2587
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63680i bk5: 24a 63670i bk6: 64a 63580i bk7: 64a 63572i bk8: 64a 63586i bk9: 64a 63589i bk10: 32a 63621i bk11: 28a 63638i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000580565
Memory Partition 4: 
Cache L2_bank_008:
MSHR contents

Cache L2_bank_009:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[4]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63353 n_act=8 n_pre=0 n_req=189 n_rd=360 n_write=10 bw_util=0.01161
n_activity=2771 dram_eff=0.2671
bk0: 0a 63731i bk1: 0a 63733i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 24a 63670i bk6: 64a 63587i bk7: 64a 63587i bk8: 64a 63588i bk9: 64a 63581i bk10: 32a 63633i bk11: 28a 63627i bk12: 0a 63728i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000313819
Memory Partition 5: 
Cache L2_bank_010:
MSHR contents

Cache L2_bank_011:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[5]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=188 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2789 dram_eff=0.2639
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63733i bk4: 20a 63678i bk5: 24a 63668i bk6: 64a 63584i bk7: 64a 63582i bk8: 64a 63589i bk9: 64a 63588i bk10: 32a 63598i bk11: 28a 63623i bk12: 0a 63729i bk13: 0a 63730i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.00105129

========= L2 cache stats =========
L2_cache_bank[0]: Access = 1826, Miss = 93, Miss_rate = 0.051, Pending_hits = 248, Reservation_fails = 4441
L2_cache_bank[1]: Access = 1684, Miss = 90, Miss_rate = 0.053, Pending_hits = 231, Reservation_fails = 3387
L2_cache_bank[2]: Access = 1777, Miss = 92, Miss_rate = 0.052, Pending_hits = 239, Reservation_fails = 3478
L2_cache_bank[3]: Access = 1652, Miss = 89, Miss_rate = 0.054, Pending_hits = 227, Reservation_fails = 3558
L2_cache_bank[4]: Access = 1642, Miss = 90, Miss_rate = 0.055, Pending_hits = 236, Reservation_fails = 3430
L2_cache_bank[5]: Access = 1661, Miss = 90, Miss_rate = 0.054, Pending_hits = 232, Reservation_fails = 3472
L2_cache_bank[6]: Access = 1637, Miss = 90, Miss_rate = 0.055, Pending_hits = 237, Reservation_fails = 3884
L2_cache_bank[7]: Access = 1639, Miss = 90, Miss_rate = 0.055, Pending_hits = 250, Reservation_fails = 4069
L2_cache_bank[8]: Access = 1656, Miss = 90, Miss_rate = 0.054, Pending_hits = 250, Reservation_fails = 3783
L2_cache_bank[9]: Access = 1643, Miss = 90, Miss_rate = 0.055, Pending_hits = 241, Reservation_fails = 3742
L2_cache_bank[10]: Access = 1641, Miss = 90, Miss_rate = 0.055, Pending_hits = 239, Reservation_fails = 3801
L2_cache_bank[11]: Access = 1650, Miss = 90, Miss_rate = 0.055, Pending_hits = 243, Reservation_fails = 3849
L2_total_cache_accesses = 20108
L2_total_cache_misses = 1084
L2_total_cache_miss_rate = 0.0539
L2_total_cache_pending_hits = 2873
L2_total_cache_reservation_fails = 44894
L2_total_cache_breakdown:
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 14077
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 2839
	L2_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 1056
	L2_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 44379
	L2_cache_stats_breakdown[CONST_ACC_R][HIT] = 56
	L2_cache_stats_breakdown[CONST_ACC_R][HIT_RESERVED] = 3
	L2_cache_stats_breakdown[CONST_ACC_R][MISS] = 1
	L2_cache_stats_breakdown[CONST_ACC_R][RESERVATION_FAIL] = 129
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 1794
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT_RESERVED] = 19
	L2_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 23
	L2_cache_stats_breakdown[INST_ACC_R][HIT] = 224
	L2_cache_stats_breakdown[INST_ACC_R][HIT_RESERVED] = 12
	L2_cache_stats_breakdown[INST_ACC_R][MISS] = 4
	L2_cache_stats_breakdown[INST_ACC_R][RESERVATION_FAIL] = 386
L2_cache_data_port_util = 0.104
L2_cache_fill_port_util = 0.007

icnt_total_pkts_mem_to_simt=93076
icnt_total_pkts_simt_to_mem=23324
LD_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ST_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
----------------------------Interconnect-DETAILS--------------------------------
Class 0:
Packet latency average = 18.5053
	minimum = 6
	maximum = 729
Network latency average = 13.7655
	minimum = 6
	maximum = 426
Slowest packet = 623
Flit latency average = 11.0758
	minimum = 6
	maximum = 426
Slowest flit = 1683
Fragmentation average = 0
	minimum = 0
	maximum = 0
Injected packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Accepted packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Injected flit rate average = 0.0892902
	minimum = 0.0318131 (at node 3)
	maximum = 0.173936 (at node 15)
Accepted flit rate average= 0.0892902
	minimum = 0.0392072 (at node 24)
	maximum = 0.129406 (at node 9)
Injected packet length average = 2.89437
Accepted packet length average = 2.89437
Total in-flight flits = 0 (0 measured)
====== Overall Traffic Statistics ======
====== Traffic class 0 ======
Packet latency average = 18.5053 (1 samples)
	minimum = 6 (1 samples)
	maximum = 729 (1 samples)
Network latency average = 13.7655 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Flit latency average = 11.0758 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Fragmentation average = 0 (1 samples)
	minimum = 0 (1 samples)
	maximum = 0 (1 samples)
Injected packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Accepted packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Injected flit rate average = 0.0892902 (1 samples)
	minimum = 0.0318131 (1 samples)
	maximum = 0.173936 (1 samples)
Accepted flit rate average = 0.0892902 (1 samples)
	minimum = 0.0392072 (1 samples)
	maximum = 0.129406 (1 samples)
Injected packet size average = 2.89437 (1 samples)
Accepted packet size average = 2.89437 (1 samples)
Hops average = 1 (1 samples)
----------------------------END-of-Interconnect-DETAILS-------------------------


gpgpu_simulation_time = 0 days, 0 hrs, 0 min, 49 sec (49 sec)
gpgpu_simulation_rate = 339428 (inst/sec)
gpgpu_simulation_rate = 985 (cycle/sec)
total time is 49481 ms


        *** GPGPU-Sim Simulator Version 3.2.2  [build 0] ***


GPGPU-Sim PTX: simulation mode 0 (can change with PTX_SIM_MODE_FUNC environment variable:
               1=functional simulation only, 0=detailed performance simulator)
GPGPU-Sim: Configuration options:

-network_mode                           1 # Interconnection network mode
-inter_config_file   config_fermi_islip.icnt # Interconnection network config file
-gpgpu_ptx_use_cuobjdump                    1 # Use cuobjdump to extract ptx and sass from binaries
-gpgpu_experimental_lib_support                    0 # Try to extract code from cuda libraries [Broken because of unknown cudaGetExportTable]
-gpgpu_ptx_convert_to_ptxplus                    0 # Convert SASS (native ISA) to ptxplus and run ptxplus
-gpgpu_ptx_force_max_capability                   20 # Force maximum compute capability
-gpgpu_ptx_inst_debug_to_file                    0 # Dump executed instructions' debug information to file
-gpgpu_ptx_inst_debug_file       inst_debug.txt # Executed instructions' debug output file
-gpgpu_ptx_inst_debug_thread_uid                    1 # Thread UID for executed instructions' debug output
-gpgpu_simd_model                       1 # 1 = post-dominator
-gpgpu_shader_core_pipeline              1536:32 # shader core pipeline config, i.e., {<nthread>:<warpsize>}
-gpgpu_tex_cache:l1  4:128:24,L:R:m:N:L,F:128:4,128:2 # per-shader L1 texture cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>:<rf>}
-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4 # per-shader L1 constant memory cache  (READ-ONLY) config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:il1     4:128:4,L:R:f:N:L,A:2:32,4 # shader L1 instruction cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>} 
-gpgpu_cache:dl1     32:128:4,L:L:m:N:H,A:32:8,8 # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PrefL1                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gpgpu_cache:dl1PreShared                 none # per-shader L1 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}
-gmem_skip_L1D                          0 # global memory access skip L1D cache (implements -Xptxas -dlcm=cg, default=no skip)
-gpgpu_perfect_mem                      0 # enable perfect memory mode (no cache miss)
-n_regfile_gating_group                    4 # group of lanes that should be read/written together)
-gpgpu_clock_gated_reg_file                    0 # enable clock gated reg file for power calculations
-gpgpu_clock_gated_lanes                    0 # enable clock gated lanes for power calculations
-gpgpu_shader_registers                32768 # Number of registers per shader core. Limits number of concurrent CTAs. (default 8192)
-gpgpu_shader_cta                       8 # Maximum number of concurrent CTAs in shader (default 8)
-gpgpu_num_cta_barriers                   16 # Maximum number of named barriers per CTA (default 16)
-gpgpu_n_clusters                      15 # number of processing clusters
-gpgpu_n_cores_per_cluster                    4 # number of simd cores per cluster
-gpgpu_n_cluster_ejection_buffer_size                    8 # number of packets in ejection buffer
-gpgpu_n_ldst_response_buffer_size                    2 # number of response packets in ld/st unit ejection buffer
-gpgpu_shmem_size                   16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size                   49152 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefL1                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_size_PrefShared                16384 # Size of shared memory per shader core (default 16kB)
-gpgpu_shmem_num_banks                   32 # Number of banks in the shared memory in each shader core (default 16)
-gpgpu_shmem_limited_broadcast                    0 # Limit shared memory to do one broadcast per cycle (default on)
-gpgpu_shmem_warp_parts                    1 # Number of portions a warp is divided into for shared memory bank conflict check 
-gpgpu_warpdistro_shader                   -1 # Specify which shader core to collect the warp size distribution from
-gpgpu_warp_issue_shader                    0 # Specify which shader core to collect the warp issue distribution from
-gpgpu_local_mem_map                    1 # Mapping from local memory space address to simulated GPU physical address space (default = enabled)
-gpgpu_num_reg_banks                   16 # Number of register banks (default = 8)
-gpgpu_reg_bank_use_warp_id                    0 # Use warp ID in mapping registers to banks (default = off)
-gpgpu_operand_collector_num_units_sp                    6 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_sfu                    8 # number of collector units (default = 4)
-gpgpu_operand_collector_num_units_mem                    2 # number of collector units (default = 2)
-gpgpu_operand_collector_num_units_gen                    0 # number of collector units (default = 0)
-gpgpu_operand_collector_num_in_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_in_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_operand_collector_num_out_ports_sp                    2 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_sfu                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_mem                    1 # number of collector unit in ports (default = 1)
-gpgpu_operand_collector_num_out_ports_gen                    0 # number of collector unit in ports (default = 0)
-gpgpu_coalesce_arch                   13 # Coalescing arch (default = 13, anything else is off for now)
-gpgpu_num_sched_per_core                    2 # Number of warp schedulers per core
-gpgpu_max_insn_issue_per_warp                    1 # Max number of instructions that can be issued per warp in one cycle by scheduler
-gpgpu_simt_core_sim_order                    1 # Select the simulation order of cores in a cluster (0=Fix, 1=Round-Robin)
-gpgpu_pipeline_widths        2,1,1,2,1,1,2 # Pipeline widths ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
-gpgpu_num_sp_units                     2 # Number of SP units (default=1)
-gpgpu_num_sfu_units                    1 # Number of SF units (default=1)
-gpgpu_num_mem_units                    1 # Number if ldst units (default=1) WARNING: not hooked up to anything
-gpgpu_scheduler                      gto # Scheduler configuration: < lrr | gto | two_level_active > If two_level_active:<num_active_warps>:<inner_prioritization>:<outer_prioritization>For complete list of prioritization values see shader.h enum scheduler_prioritization_typeDefault: gto
-gpgpu_dram_scheduler                    1 # 0 = fifo, 1 = FR-FCFS (defaul)
-gpgpu_dram_partition_queues              8:8:8:8 # i2$:$2d:d2$:$2i
-l2_ideal                               0 # Use a ideal L2 cache that always hit
-gpgpu_cache:dl2     64:128:8,L:B:m:W:L,A:32:4,4:0,32 # unified banked L2 data cache config  {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq>}
-gpgpu_cache:dl2_texture_only                    0 # L2 cache used for texture only
-gpgpu_n_mem                            6 # number of memory modules (e.g. memory controllers) in gpu
-gpgpu_n_sub_partition_per_mchannel                    2 # number of memory subpartition in each memory module
-gpgpu_n_mem_per_ctrlr                    2 # number of memory chips per memory controller
-gpgpu_memlatency_stat                   14 # track and display latency statistics 0x2 enables MC, 0x4 enables queue logs
-gpgpu_frfcfs_dram_sched_queue_size                   16 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_return_queue_size                  116 # 0 = unlimited (default); # entries per chip
-gpgpu_dram_buswidth                    4 # default = 4 bytes (8 bytes per cycle at DDR)
-gpgpu_dram_burst_length                    8 # Burst length of each DRAM request (default = 4 data bus cycle)
-dram_data_command_freq_ratio                    4 # Frequency ratio between DRAM data bus and command bus (default = 2 times, i.e. DDR)
-gpgpu_dram_timing_opt nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2 # DRAM timing parameters = {nbk:tCCD:tRRD:tRCD:tRAS:tRP:tRC:CL:WL:tCDLR:tWR:nbkgrp:tCCDL:tRTPL}
-rop_latency                          120 # ROP queue latency (default 85)
-dram_latency                         100 # DRAM latency (default 30)
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS # mapping memory address to dram model {dramid@<start bit>;<memory address map>}
-gpgpu_mem_addr_test                    0 # run sweep test to check address mapping for aliased address
-gpgpu_mem_address_mask                    1 # 0 = old addressing mask, 1 = new addressing mask, 2 = new add. mask + flipped bank sel and chip sel bits
-gpuwattch_xml_file  gpuwattch_gtx480.xml # GPUWattch XML file
-power_simulation_enabled                    1 # Turn on power simulator (1=On, 0=Off)
-power_per_cycle_dump                    0 # Dump detailed power output each cycle
-power_trace_enabled                    0 # produce a file for the power trace (1=On, 0=Off)
-power_trace_zlevel                     6 # Compression level of the power trace output log (0=no comp, 9=highest)
-steady_power_levels_enabled                    0 # produce a file for the steady power levels (1=On, 0=Off)
-steady_state_definition                  8:4 # allowed deviation:number of samples
-gpgpu_max_cycle                        0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_insn                         0 # terminates gpu simulation early (0 = no limit)
-gpgpu_max_cta                          0 # terminates gpu simulation early (0 = no limit)
-gpgpu_runtime_stat                   500 # display runtime statistics such as dram utilization {<freq>:<flag>}
-liveness_message_freq                    1 # Minimum number of seconds between simulation liveness messages (0 = always print)
-gpgpu_flush_l1_cache                    0 # Flush L1 cache at the end of each kernel call
-gpgpu_flush_l2_cache                    0 # Flush L2 cache at the end of each kernel call
-gpgpu_deadlock_detect                    1 # Stop the simulation at deadlock (1=on (default), 0=off)
-gpgpu_ptx_instruction_classification                    0 # if enabled will classify ptx instruction types per kernel (Max 255 kernels now)
-gpgpu_ptx_sim_mode                     0 # Select between Performance (default) or Functional simulation (1)
-gpgpu_clock_domains 700.0:700.0:700.0:924.0 # Clock Domain Frequencies in MhZ {<Core Clock>:<ICNT Clock>:<L2 Clock>:<DRAM Clock>}
-gpgpu_max_concurrent_kernel                    8 # maximum kernels that can run concurrently on GPU
-gpgpu_cflog_interval                    0 # Interval between each snapshot in control flow logger
-visualizer_enabled                     0 # Turn on visualizer output (1=On, 0=Off)
-visualizer_outputfile                 NULL # Specifies the output log file for visualizer
-visualizer_zlevel                      6 # Compression level of the visualizer output log (0=no comp, 9=highest)
-trace_enabled                          0 # Turn on traces
-trace_components                    none # comma seperated list of traces to enable. Complete list found in trace_streams.tup. Default none
-trace_sampling_core                    0 # The core which is printed using CORE_DPRINTF. Default 0
-trace_sampling_memory_partition                   -1 # The memory partition which is printed using MEMPART_DPRINTF. Default -1 (i.e. all)
-enable_ptx_file_line_stats                    1 # Turn on PTX source line statistic profiling. (1 = On)
-ptx_line_stats_filename gpgpu_inst_stats.txt # Output file for PTX source line statistics.
-save_embedded_ptx                      0 # saves ptx files embedded in binary as <n>.ptx
-keep                                   0 # keep intermediate files created by GPGPU-Sim when interfacing with external programs
-gpgpu_ptx_save_converted_ptxplus                    0 # Saved converted ptxplus to a file
-ptx_opcode_latency_int         4,13,4,5,145 # Opcode latencies for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,19,25,145
-ptx_opcode_latency_fp          4,13,4,5,39 # Opcode latencies for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,30
-ptx_opcode_latency_dp         8,19,8,8,330 # Opcode latencies for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,335
-ptx_opcode_initiation_int            1,2,2,1,8 # Opcode initiation intervals for integers <ADD,MAX,MUL,MAD,DIV>Default 1,1,4,4,32
-ptx_opcode_initiation_fp            1,2,1,1,4 # Opcode initiation intervals for single precision floating points <ADD,MAX,MUL,MAD,DIV>Default 1,1,1,1,5
-ptx_opcode_initiation_dp         8,16,8,8,130 # Opcode initiation intervals for double precision floating points <ADD,MAX,MUL,MAD,DIV>Default 8,8,8,8,130
DRAM Timing Options:
nbk                                    16 # number of banks
CCD                                     2 # column to column delay
RRD                                     6 # minimal delay between activation of rows in different banks
RCD                                    12 # row to column delay
RAS                                    28 # time needed to activate row
RP                                     12 # time needed to precharge (deactivate) row
RC                                     40 # row cycle time
CDLR                                    5 # switching from write to read (changes tWTR)
WR                                     12 # last data-in to row precharge
CL                                     12 # CAS latency
WL                                      4 # Write latency
nbkgrp                                  4 # number of bank groups
CCDL                                    3 # column to column delay between accesses to different bank groups
RTPL                                    2 # read to precharge delay between accesses to different bank groups
Total number of memory sub partition = 12
addr_dec_mask[CHIP]  = 0000000000000000 	high:64 low:0
addr_dec_mask[BK]    = 000000000000e100 	high:16 low:8
addr_dec_mask[ROW]   = 000000000fff0000 	high:28 low:16
addr_dec_mask[COL]   = 0000000000001eff 	high:13 low:0
addr_dec_mask[BURST] = 000000000000003f 	high:6 low:0
sub_partition_id_mask = 0000000000000100
GPGPU-Sim uArch: clock freqs: 700000000.000000:700000000.000000:700000000.000000:924000000.000000
GPGPU-Sim uArch: clock periods: 0.00000000142857142857:0.00000000142857142857:0.00000000142857142857:0.00000000108225108225
*** Initializing Memory Statistics ***
GPGPU-Sim uArch: interconnect node map (shaderID+MemID to icntID)
GPGPU-Sim uArch: Memory nodes ID start from index: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
GPGPU-Sim uArch: interconnect node reverse map (icntID to shaderID+MemID)
GPGPU-Sim uArch: Memory nodes start from ID: 15
GPGPU-Sim uArch:    0   1   2   3   4
GPGPU-Sim uArch:    5   6   7   8   9
GPGPU-Sim uArch:   10  11  12  13  14
GPGPU-Sim uArch:   15  16  17  18  19
GPGPU-Sim uArch:   20  21  22  23  24
GPGPU-Sim uArch:   25  26
83a4e518f69376f7e08643a3a9e17862  /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
GPGPU-Sim uArch: performance model initialization complete.
GPGPU-Sim PTX: __cudaRegisterFatBinary, fat_cubin_handle = 1, filename=mm.cu
self exe links to: /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM
Running md5sum using "md5sum /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM "
Running cuobjdump using "$CUDA_INSTALL_PATH/bin/cuobjdump -ptx -elf -sass /home/ly/下载/test/gpgpu-sim_distribution-master/ispass2009-benchmarks-master_2/bin/release/MM > _cuobjdump_complete_output_VzHSZO"
Parsing file _cuobjdump_complete_output_VzHSZO
######### cuobjdump parser ########
## Adding new section ELF
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_1.ptx
Adding arch: sm_10
Adding identifier: mm.cu
## Adding new section ELF
Adding arch: sm_20
Adding identifier: mm.cu
## Adding new section PTX
Adding ptx filename: _cuobjdump_2.ptx
Adding arch: sm_20
Adding identifier: mm.cu
Done parsing!!!
GPGPU-Sim PTX: __cudaRegisterFunction _Z14matrix_mul_gpuPiS_S_i : hostFun 0x0x400ce0, fat_cubin_handle = 1
GPGPU-Sim PTX: instruction assembly for function '_Z14matrix_mul_gpuPiS_S_i'...   done.
GPGPU-Sim PTX: finding reconvergence points for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate dominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: Finding immediate postdominators for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'...
GPGPU-Sim PTX: reconvergence points for _Z14matrix_mul_gpuPiS_S_i...
GPGPU-Sim PTX:  1 (potential) branch divergence @  PC=0x048 (_1.ptx:71) @%p1 bra $Lt_0_2306;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX:  2 (potential) branch divergence @  PC=0x130 (_1.ptx:103) @%p2 bra $Lt_0_1794;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:  3 (potential) branch divergence @  PC=0x138 (_1.ptx:104) bra.uni $Lt_0_1282;
GPGPU-Sim PTX:    immediate post dominator      @  PC=0x170 (_1.ptx:114) ld.param.u64 %rd11, [__cudaparm__Z14matrix_mul_gpuPiS_S_i_P];
GPGPU-Sim PTX: ... end of reconvergence points for _Z14matrix_mul_gpuPiS_S_i
GPGPU-Sim PTX: ... done pre-decoding instructions for '_Z14matrix_mul_gpuPiS_S_i'.
GPGPU-Sim PTX: finished parsing EMBEDDED .ptx file _1.ptx
Adding _cuobjdump_2.ptx with cubin handle 1
GPGPU-Sim PTX: extracting embedded .ptx to temporary file "_ptx_llMAAe"
Running: cat _ptx_llMAAe | sed 's/.version 1.5/.version 1.4/' | sed 's/, texmode_independent//' | sed 's/\(\.extern \.const\[1\] .b8 \w\+\)\[\]/\1\[1\]/' | sed 's/const\[.\]/const\[0\]/g' > _ptx2_PzvjbE
GPGPU-Sim PTX: generating ptxinfo using "$CUDA_INSTALL_PATH/bin/ptxas --gpu-name=sm_20 -v _ptx2_PzvjbE --output-file  /dev/null 2> _ptx_llMAAeinfo"
GPGPU-Sim PTX: Kernel '_Z14matrix_mul_gpuPiS_S_i' : regs=14, lmem=0, smem=0, cmem=60
GPGPU-Sim PTX: removing ptxinfo using "rm -f _ptx_llMAAe _ptx2_PzvjbE _ptx_llMAAeinfo"
GPGPU-Sim PTX: loading globals with explicit initializers... 
GPGPU-Sim PTX: finished loading globals (0 bytes total).
GPGPU-Sim PTX: loading constants with explicit initializers...  done.
Block(10,10)   Grid(15,8).

GPGPU-Sim PTX: cudaLaunch for 0x0x400ce0 (mode=performance simulation) on stream 0
GPGPU-Sim PTX: pushing kernel '_Z14matrix_mul_gpuPiS_S_i' to stream 0, gridDim= (15,8,1) blockDim = (10,10,1) 
kernel '_Z14matrix_mul_gpuPiS_S_i' transfer to GPU hardware scheduler
GPGPU-Sim uArch: Shader 4 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: CTA/core = 8, limited by: cta_limit
GPGPU-Sim uArch: core:  4, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 8 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  8, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 12 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 12, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 16 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 16, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 20 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 20, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 24 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 24, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 28 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 28, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 32 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 32, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 36 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 36, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 40 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 40, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 44 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 44, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 48 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 48, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 52 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 52, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 56 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 56, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 0 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  0, cta: 0 initialized @(1,0)
GPGPU-Sim uArch: Shader 5 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  5, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 9 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  9, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 13 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 13, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 17 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 17, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 21 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 21, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 25 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 25, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 29 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 29, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 33 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 33, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 37 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 37, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 41 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 41, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 45 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 45, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 49 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 49, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 53 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 53, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 57 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 57, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 1 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  1, cta: 0 initialized @(2,0)
GPGPU-Sim uArch: Shader 6 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  6, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 10 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 10, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 14 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 14, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 18 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 18, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 22 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 22, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 26 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 26, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 30 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 30, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 34 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 34, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 38 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 38, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 42 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 42, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 46 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 46, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 50 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 50, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 54 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 54, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 58 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 58, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 2 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  2, cta: 0 initialized @(3,0)
GPGPU-Sim uArch: Shader 7 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  7, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 11 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 11, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 15 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 15, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 19 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 19, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 23 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 23, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 27 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 27, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 31 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 31, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 35 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 35, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 39 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 39, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 43 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 43, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 47 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 47, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 51 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 51, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 55 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 55, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 59 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core: 59, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: Shader 3 bind to kernel 1 '_Z14matrix_mul_gpuPiS_S_i'
GPGPU-Sim uArch: core:  3, cta: 0 initialized @(4,0)
GPGPU-Sim uArch: core:  4, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  8, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 12, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 16, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 20, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 24, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 28, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 32, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 36, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 40, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 44, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 48, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 52, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core: 56, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  0, cta: 1 initialized @(5,0)
GPGPU-Sim uArch: core:  5, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  9, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 13, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 17, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 21, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 25, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 29, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 33, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 37, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 41, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 45, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 49, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 53, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core: 57, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  1, cta: 1 initialized @(6,0)
GPGPU-Sim uArch: core:  6, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 10, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 14, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 18, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 22, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 26, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 30, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 34, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 38, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 42, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 46, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 50, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 54, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core: 58, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  2, cta: 1 initialized @(7,0)
GPGPU-Sim uArch: core:  7, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 11, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 15, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 19, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 23, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 27, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 31, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 35, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 39, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 43, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 47, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 51, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 55, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core: 59, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: core:  3, cta: 1 initialized @(8,0)
GPGPU-Sim uArch: cycles simulated: 500  inst.: 49456 (ipc=98.9) sim_rate=49456 (inst/sec) elapsed = 0:0:00:01 / Mon Jun 14 15:49:49 2021
GPGPU-Sim PTX: 100000 instructions simulated : ctaid=(1,0,0) tid=(1,5,0)
GPGPU-Sim PTX: 200000 instructions simulated : ctaid=(0,4,0) tid=(1,1,0)
GPGPU-Sim PTX: 300000 instructions simulated : ctaid=(8,2,0) tid=(5,3,0)
GPGPU-Sim PTX: 400000 instructions simulated : ctaid=(2,2,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 2000  inst.: 374980 (ipc=187.5) sim_rate=187490 (inst/sec) elapsed = 0:0:00:02 / Mon Jun 14 15:49:51 2021
GPGPU-Sim PTX: 500000 instructions simulated : ctaid=(12,1,0) tid=(3,0,0)
GPGPU-Sim PTX: 600000 instructions simulated : ctaid=(3,6,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 3000  inst.: 650956 (ipc=217.0) sim_rate=216985 (inst/sec) elapsed = 0:0:00:03 / Mon Jun 14 15:49:52 2021
GPGPU-Sim PTX: 700000 instructions simulated : ctaid=(8,1,0) tid=(3,8,0)
GPGPU-Sim PTX: 800000 instructions simulated : ctaid=(3,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 900000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 1000000 instructions simulated : ctaid=(8,6,0) tid=(7,8,0)
GPGPU-Sim PTX: 1100000 instructions simulated : ctaid=(1,3,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 4000  inst.: 1092420 (ipc=273.1) sim_rate=273105 (inst/sec) elapsed = 0:0:00:04 / Mon Jun 14 15:49:53 2021
GPGPU-Sim PTX: 1200000 instructions simulated : ctaid=(5,0,0) tid=(9,1,0)
GPGPU-Sim PTX: 1300000 instructions simulated : ctaid=(11,0,0) tid=(9,9,0)
GPGPU-Sim PTX: 1400000 instructions simulated : ctaid=(9,0,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 5000  inst.: 1472608 (ipc=294.5) sim_rate=294521 (inst/sec) elapsed = 0:0:00:05 / Mon Jun 14 15:49:54 2021
GPGPU-Sim PTX: 1500000 instructions simulated : ctaid=(10,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 1600000 instructions simulated : ctaid=(4,5,0) tid=(7,2,0)
GPGPU-Sim PTX: 1700000 instructions simulated : ctaid=(10,2,0) tid=(7,6,0)
GPGPU-Sim PTX: 1800000 instructions simulated : ctaid=(0,3,0) tid=(5,7,0)
GPGPU-Sim uArch: cycles simulated: 6000  inst.: 1825436 (ipc=304.2) sim_rate=304239 (inst/sec) elapsed = 0:0:00:06 / Mon Jun 14 15:49:55 2021
GPGPU-Sim PTX: 1900000 instructions simulated : ctaid=(1,1,0) tid=(7,6,0)
GPGPU-Sim PTX: 2000000 instructions simulated : ctaid=(14,4,0) tid=(9,1,0)
GPGPU-Sim PTX: 2100000 instructions simulated : ctaid=(2,5,0) tid=(1,7,0)
GPGPU-Sim PTX: 2200000 instructions simulated : ctaid=(5,2,0) tid=(9,7,0)
GPGPU-Sim uArch: cycles simulated: 7000  inst.: 2181760 (ipc=311.7) sim_rate=311680 (inst/sec) elapsed = 0:0:00:07 / Mon Jun 14 15:49:56 2021
GPGPU-Sim PTX: 2300000 instructions simulated : ctaid=(1,2,0) tid=(1,9,0)
GPGPU-Sim PTX: 2400000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 2500000 instructions simulated : ctaid=(14,2,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 8000  inst.: 2555524 (ipc=319.4) sim_rate=319440 (inst/sec) elapsed = 0:0:00:08 / Mon Jun 14 15:49:57 2021
GPGPU-Sim PTX: 2600000 instructions simulated : ctaid=(11,4,0) tid=(5,3,0)
GPGPU-Sim PTX: 2700000 instructions simulated : ctaid=(0,1,0) tid=(7,8,0)
GPGPU-Sim PTX: 2800000 instructions simulated : ctaid=(1,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 2900000 instructions simulated : ctaid=(6,6,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 9000  inst.: 2895792 (ipc=321.8) sim_rate=321754 (inst/sec) elapsed = 0:0:00:09 / Mon Jun 14 15:49:58 2021
GPGPU-Sim PTX: 3000000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 9500  inst.: 3069396 (ipc=323.1) sim_rate=306939 (inst/sec) elapsed = 0:0:00:10 / Mon Jun 14 15:49:59 2021
GPGPU-Sim PTX: 3100000 instructions simulated : ctaid=(9,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 3200000 instructions simulated : ctaid=(5,1,0) tid=(7,2,0)
GPGPU-Sim PTX: 3300000 instructions simulated : ctaid=(12,6,0) tid=(9,5,0)
GPGPU-Sim PTX: 3400000 instructions simulated : ctaid=(0,0,0) tid=(5,1,0)
GPGPU-Sim uArch: cycles simulated: 10500  inst.: 3416504 (ipc=325.4) sim_rate=310591 (inst/sec) elapsed = 0:0:00:11 / Mon Jun 14 15:50:00 2021
GPGPU-Sim PTX: 3500000 instructions simulated : ctaid=(14,7,0) tid=(3,8,0)
GPGPU-Sim PTX: 3600000 instructions simulated : ctaid=(8,0,0) tid=(1,7,0)
GPGPU-Sim PTX: 3700000 instructions simulated : ctaid=(12,3,0) tid=(5,3,0)
GPGPU-Sim PTX: 3800000 instructions simulated : ctaid=(13,2,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 11500  inst.: 3804716 (ipc=330.8) sim_rate=317059 (inst/sec) elapsed = 0:0:00:12 / Mon Jun 14 15:50:01 2021
GPGPU-Sim PTX: 3900000 instructions simulated : ctaid=(4,7,0) tid=(9,1,0)
GPGPU-Sim PTX: 4000000 instructions simulated : ctaid=(4,0,0) tid=(5,1,0)
GPGPU-Sim PTX: 4100000 instructions simulated : ctaid=(5,6,0) tid=(7,2,0)
GPGPU-Sim uArch: cycles simulated: 12500  inst.: 4159952 (ipc=332.8) sim_rate=319996 (inst/sec) elapsed = 0:0:00:13 / Mon Jun 14 15:50:02 2021
GPGPU-Sim PTX: 4200000 instructions simulated : ctaid=(3,3,0) tid=(1,5,0)
GPGPU-Sim PTX: 4300000 instructions simulated : ctaid=(5,2,0) tid=(5,7,0)
GPGPU-Sim PTX: 4400000 instructions simulated : ctaid=(1,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 4500000 instructions simulated : ctaid=(4,3,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 13500  inst.: 4478000 (ipc=331.7) sim_rate=319857 (inst/sec) elapsed = 0:0:00:14 / Mon Jun 14 15:50:03 2021
GPGPU-Sim PTX: 4600000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 4700000 instructions simulated : ctaid=(12,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 4800000 instructions simulated : ctaid=(12,7,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 14500  inst.: 4845224 (ipc=334.2) sim_rate=323014 (inst/sec) elapsed = 0:0:00:15 / Mon Jun 14 15:50:04 2021
GPGPU-Sim PTX: 4900000 instructions simulated : ctaid=(14,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 5000000 instructions simulated : ctaid=(14,2,0) tid=(9,1,0)
GPGPU-Sim PTX: 5100000 instructions simulated : ctaid=(7,3,0) tid=(3,8,0)
GPGPU-Sim PTX: 5200000 instructions simulated : ctaid=(14,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 15500  inst.: 5198688 (ipc=335.4) sim_rate=324918 (inst/sec) elapsed = 0:0:00:16 / Mon Jun 14 15:50:05 2021
GPGPU-Sim PTX: 5300000 instructions simulated : ctaid=(9,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 5400000 instructions simulated : ctaid=(3,5,0) tid=(7,6,0)
GPGPU-Sim PTX: 5500000 instructions simulated : ctaid=(4,3,0) tid=(5,3,0)
GPGPU-Sim uArch: cycles simulated: 16500  inst.: 5561548 (ipc=337.1) sim_rate=327149 (inst/sec) elapsed = 0:0:00:17 / Mon Jun 14 15:50:06 2021
GPGPU-Sim PTX: 5600000 instructions simulated : ctaid=(5,1,0) tid=(9,9,0)
GPGPU-Sim PTX: 5700000 instructions simulated : ctaid=(8,0,0) tid=(3,4,0)
GPGPU-Sim PTX: 5800000 instructions simulated : ctaid=(3,1,0) tid=(1,3,0)
GPGPU-Sim PTX: 5900000 instructions simulated : ctaid=(0,7,0) tid=(9,1,0)
GPGPU-Sim uArch: cycles simulated: 17500  inst.: 5922888 (ipc=338.5) sim_rate=329049 (inst/sec) elapsed = 0:0:00:18 / Mon Jun 14 15:50:07 2021
GPGPU-Sim PTX: 6000000 instructions simulated : ctaid=(3,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 6100000 instructions simulated : ctaid=(2,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 6200000 instructions simulated : ctaid=(2,0,0) tid=(9,3,0)
GPGPU-Sim PTX: 6300000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 18500  inst.: 6276544 (ipc=339.3) sim_rate=330344 (inst/sec) elapsed = 0:0:00:19 / Mon Jun 14 15:50:08 2021
GPGPU-Sim PTX: 6400000 instructions simulated : ctaid=(10,0,0) tid=(1,7,0)
GPGPU-Sim PTX: 6500000 instructions simulated : ctaid=(13,3,0) tid=(7,4,0)
GPGPU-Sim PTX: 6600000 instructions simulated : ctaid=(8,0,0) tid=(7,8,0)
GPGPU-Sim uArch: cycles simulated: 19500  inst.: 6644036 (ipc=340.7) sim_rate=332201 (inst/sec) elapsed = 0:0:00:20 / Mon Jun 14 15:50:09 2021
GPGPU-Sim PTX: 6700000 instructions simulated : ctaid=(8,6,0) tid=(5,1,0)
GPGPU-Sim PTX: 6800000 instructions simulated : ctaid=(3,7,0) tid=(1,9,0)
GPGPU-Sim PTX: 6900000 instructions simulated : ctaid=(5,6,0) tid=(9,9,0)
GPGPU-Sim PTX: 7000000 instructions simulated : ctaid=(4,5,0) tid=(5,9,0)
GPGPU-Sim uArch: cycles simulated: 20500  inst.: 7001524 (ipc=341.5) sim_rate=333405 (inst/sec) elapsed = 0:0:00:21 / Mon Jun 14 15:50:10 2021
GPGPU-Sim PTX: 7100000 instructions simulated : ctaid=(11,0,0) tid=(3,2,0)
GPGPU-Sim PTX: 7200000 instructions simulated : ctaid=(12,4,0) tid=(9,9,0)
GPGPU-Sim PTX: 7300000 instructions simulated : ctaid=(12,0,0) tid=(5,1,0)
GPGPU-Sim uArch: cycles simulated: 21500  inst.: 7362756 (ipc=342.5) sim_rate=334670 (inst/sec) elapsed = 0:0:00:22 / Mon Jun 14 15:50:11 2021
GPGPU-Sim PTX: 7400000 instructions simulated : ctaid=(1,4,0) tid=(1,5,0)
GPGPU-Sim PTX: 7500000 instructions simulated : ctaid=(11,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 7600000 instructions simulated : ctaid=(4,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 7700000 instructions simulated : ctaid=(11,7,0) tid=(5,9,0)
GPGPU-Sim uArch: cycles simulated: 22500  inst.: 7729204 (ipc=343.5) sim_rate=336052 (inst/sec) elapsed = 0:0:00:23 / Mon Jun 14 15:50:12 2021
GPGPU-Sim PTX: 7800000 instructions simulated : ctaid=(11,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 7900000 instructions simulated : ctaid=(7,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 8000000 instructions simulated : ctaid=(8,7,0) tid=(1,5,0)
GPGPU-Sim PTX: 8100000 instructions simulated : ctaid=(4,0,0) tid=(3,6,0)
GPGPU-Sim uArch: cycles simulated: 23500  inst.: 8076140 (ipc=343.7) sim_rate=336505 (inst/sec) elapsed = 0:0:00:24 / Mon Jun 14 15:50:13 2021
GPGPU-Sim PTX: 8200000 instructions simulated : ctaid=(6,3,0) tid=(7,0,0)
GPGPU-Sim PTX: 8300000 instructions simulated : ctaid=(12,7,0) tid=(3,6,0)
GPGPU-Sim PTX: 8400000 instructions simulated : ctaid=(5,5,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 24500  inst.: 8433512 (ipc=344.2) sim_rate=337340 (inst/sec) elapsed = 0:0:00:25 / Mon Jun 14 15:50:14 2021
GPGPU-Sim PTX: 8500000 instructions simulated : ctaid=(1,2,0) tid=(1,3,0)
GPGPU-Sim PTX: 8600000 instructions simulated : ctaid=(13,7,0) tid=(3,8,0)
GPGPU-Sim uArch: cycles simulated: 25000  inst.: 8616376 (ipc=344.7) sim_rate=331399 (inst/sec) elapsed = 0:0:00:26 / Mon Jun 14 15:50:15 2021
GPGPU-Sim PTX: 8700000 instructions simulated : ctaid=(3,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 8800000 instructions simulated : ctaid=(11,4,0) tid=(9,5,0)
GPGPU-Sim PTX: 8900000 instructions simulated : ctaid=(10,7,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 26000  inst.: 8970768 (ipc=345.0) sim_rate=332250 (inst/sec) elapsed = 0:0:00:27 / Mon Jun 14 15:50:16 2021
GPGPU-Sim PTX: 9000000 instructions simulated : ctaid=(9,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 9100000 instructions simulated : ctaid=(10,1,0) tid=(9,3,0)
GPGPU-Sim PTX: 9200000 instructions simulated : ctaid=(4,6,0) tid=(1,9,0)
GPGPU-Sim PTX: 9300000 instructions simulated : ctaid=(8,4,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 27000  inst.: 9350236 (ipc=346.3) sim_rate=333937 (inst/sec) elapsed = 0:0:00:28 / Mon Jun 14 15:50:17 2021
GPGPU-Sim PTX: 9400000 instructions simulated : ctaid=(6,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 9500000 instructions simulated : ctaid=(8,1,0) tid=(5,7,0)
GPGPU-Sim PTX: 9600000 instructions simulated : ctaid=(5,1,0) tid=(9,7,0)
GPGPU-Sim PTX: 9700000 instructions simulated : ctaid=(7,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 28000  inst.: 9687816 (ipc=346.0) sim_rate=334062 (inst/sec) elapsed = 0:0:00:29 / Mon Jun 14 15:50:18 2021
GPGPU-Sim PTX: 9800000 instructions simulated : ctaid=(1,2,0) tid=(1,1,0)
GPGPU-Sim PTX: 9900000 instructions simulated : ctaid=(1,7,0) tid=(7,8,0)
GPGPU-Sim PTX: 10000000 instructions simulated : ctaid=(1,2,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 29000  inst.: 10051864 (ipc=346.6) sim_rate=335062 (inst/sec) elapsed = 0:0:00:30 / Mon Jun 14 15:50:19 2021
GPGPU-Sim PTX: 10100000 instructions simulated : ctaid=(13,3,0) tid=(3,4,0)
GPGPU-Sim PTX: 10200000 instructions simulated : ctaid=(3,6,0) tid=(9,1,0)
GPGPU-Sim PTX: 10300000 instructions simulated : ctaid=(2,1,0) tid=(3,4,0)
GPGPU-Sim PTX: 10400000 instructions simulated : ctaid=(11,2,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 30000  inst.: 10408808 (ipc=347.0) sim_rate=335768 (inst/sec) elapsed = 0:0:00:31 / Mon Jun 14 15:50:20 2021
GPGPU-Sim PTX: 10500000 instructions simulated : ctaid=(6,6,0) tid=(1,1,0)
GPGPU-Sim PTX: 10600000 instructions simulated : ctaid=(12,4,0) tid=(3,0,0)
GPGPU-Sim PTX: 10700000 instructions simulated : ctaid=(0,4,0) tid=(9,7,0)
GPGPU-Sim uArch: cycles simulated: 31000  inst.: 10769580 (ipc=347.4) sim_rate=336549 (inst/sec) elapsed = 0:0:00:32 / Mon Jun 14 15:50:21 2021
GPGPU-Sim PTX: 10800000 instructions simulated : ctaid=(14,4,0) tid=(7,6,0)
GPGPU-Sim PTX: 10900000 instructions simulated : ctaid=(7,6,0) tid=(9,1,0)
GPGPU-Sim PTX: 11000000 instructions simulated : ctaid=(6,3,0) tid=(5,5,0)
GPGPU-Sim PTX: 11100000 instructions simulated : ctaid=(13,2,0) tid=(7,4,0)
GPGPU-Sim uArch: cycles simulated: 32000  inst.: 11110308 (ipc=347.2) sim_rate=336676 (inst/sec) elapsed = 0:0:00:33 / Mon Jun 14 15:50:22 2021
GPGPU-Sim PTX: 11200000 instructions simulated : ctaid=(6,4,0) tid=(7,0,0)
GPGPU-Sim PTX: 11300000 instructions simulated : ctaid=(3,6,0) tid=(9,9,0)
GPGPU-Sim PTX: 11400000 instructions simulated : ctaid=(13,4,0) tid=(3,8,0)
GPGPU-Sim PTX: 11500000 instructions simulated : ctaid=(4,5,0) tid=(5,5,0)
GPGPU-Sim uArch: cycles simulated: 33000  inst.: 11488720 (ipc=348.1) sim_rate=337903 (inst/sec) elapsed = 0:0:00:34 / Mon Jun 14 15:50:23 2021
GPGPU-Sim PTX: 11600000 instructions simulated : ctaid=(12,2,0) tid=(1,1,0)
GPGPU-Sim PTX: 11700000 instructions simulated : ctaid=(12,3,0) tid=(5,1,0)
GPGPU-Sim PTX: 11800000 instructions simulated : ctaid=(7,1,0) tid=(7,0,0)
GPGPU-Sim uArch: cycles simulated: 34000  inst.: 11823760 (ipc=347.8) sim_rate=337821 (inst/sec) elapsed = 0:0:00:35 / Mon Jun 14 15:50:24 2021
GPGPU-Sim PTX: 11900000 instructions simulated : ctaid=(2,2,0) tid=(5,5,0)
GPGPU-Sim PTX: 12000000 instructions simulated : ctaid=(8,6,0) tid=(9,1,0)
GPGPU-Sim PTX: 12100000 instructions simulated : ctaid=(2,3,0) tid=(1,9,0)
GPGPU-Sim PTX: 12200000 instructions simulated : ctaid=(11,4,0) tid=(1,5,0)
GPGPU-Sim uArch: cycles simulated: 35000  inst.: 12216676 (ipc=349.0) sim_rate=339352 (inst/sec) elapsed = 0:0:00:36 / Mon Jun 14 15:50:25 2021
GPGPU-Sim PTX: 12300000 instructions simulated : ctaid=(2,5,0) tid=(1,5,0)
GPGPU-Sim PTX: 12400000 instructions simulated : ctaid=(0,2,0) tid=(9,5,0)
GPGPU-Sim PTX: 12500000 instructions simulated : ctaid=(2,5,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 36000  inst.: 12558896 (ipc=348.9) sim_rate=339429 (inst/sec) elapsed = 0:0:00:37 / Mon Jun 14 15:50:26 2021
GPGPU-Sim PTX: 12600000 instructions simulated : ctaid=(0,5,0) tid=(1,1,0)
GPGPU-Sim PTX: 12700000 instructions simulated : ctaid=(6,3,0) tid=(7,2,0)
GPGPU-Sim PTX: 12800000 instructions simulated : ctaid=(5,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 12900000 instructions simulated : ctaid=(12,3,0) tid=(3,4,0)
GPGPU-Sim uArch: cycles simulated: 37000  inst.: 12910304 (ipc=348.9) sim_rate=339744 (inst/sec) elapsed = 0:0:00:38 / Mon Jun 14 15:50:27 2021
GPGPU-Sim PTX: 13000000 instructions simulated : ctaid=(14,2,0) tid=(1,5,0)
GPGPU-Sim PTX: 13100000 instructions simulated : ctaid=(11,5,0) tid=(1,3,0)
GPGPU-Sim PTX: 13200000 instructions simulated : ctaid=(8,4,0) tid=(3,8,0)
GPGPU-Sim uArch: cycles simulated: 38000  inst.: 13259572 (ipc=348.9) sim_rate=339989 (inst/sec) elapsed = 0:0:00:39 / Mon Jun 14 15:50:28 2021
GPGPU-Sim PTX: 13300000 instructions simulated : ctaid=(1,6,0) tid=(7,0,0)
GPGPU-Sim PTX: 13400000 instructions simulated : ctaid=(1,6,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 38500  inst.: 13449032 (ipc=349.3) sim_rate=336225 (inst/sec) elapsed = 0:0:00:40 / Mon Jun 14 15:50:29 2021
GPGPU-Sim PTX: 13500000 instructions simulated : ctaid=(3,0,0) tid=(1,3,0)
GPGPU-Sim PTX: 13600000 instructions simulated : ctaid=(7,5,0) tid=(3,4,0)
GPGPU-Sim PTX: 13700000 instructions simulated : ctaid=(3,7,0) tid=(7,4,0)
GPGPU-Sim PTX: 13800000 instructions simulated : ctaid=(2,1,0) tid=(1,3,0)
GPGPU-Sim uArch: cycles simulated: 39500  inst.: 13803876 (ipc=349.5) sim_rate=336679 (inst/sec) elapsed = 0:0:00:41 / Mon Jun 14 15:50:30 2021
GPGPU-Sim PTX: 13900000 instructions simulated : ctaid=(1,2,0) tid=(3,6,0)
GPGPU-Sim PTX: 14000000 instructions simulated : ctaid=(1,4,0) tid=(9,1,0)
GPGPU-Sim PTX: 14100000 instructions simulated : ctaid=(0,2,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 40500  inst.: 14167076 (ipc=349.8) sim_rate=337311 (inst/sec) elapsed = 0:0:00:42 / Mon Jun 14 15:50:31 2021
GPGPU-Sim PTX: 14200000 instructions simulated : ctaid=(9,7,0) tid=(9,7,0)
GPGPU-Sim PTX: 14300000 instructions simulated : ctaid=(4,6,0) tid=(3,8,0)
GPGPU-Sim PTX: 14400000 instructions simulated : ctaid=(4,5,0) tid=(9,7,0)
GPGPU-Sim PTX: 14500000 instructions simulated : ctaid=(8,6,0) tid=(7,6,0)
GPGPU-Sim uArch: cycles simulated: 41500  inst.: 14512928 (ipc=349.7) sim_rate=337509 (inst/sec) elapsed = 0:0:00:43 / Mon Jun 14 15:50:32 2021
GPGPU-Sim PTX: 14600000 instructions simulated : ctaid=(4,3,0) tid=(3,2,0)
GPGPU-Sim PTX: 14700000 instructions simulated : ctaid=(10,5,0) tid=(7,4,0)
GPGPU-Sim PTX: 14800000 instructions simulated : ctaid=(11,6,0) tid=(1,9,0)
GPGPU-Sim uArch: cycles simulated: 42500  inst.: 14871404 (ipc=349.9) sim_rate=337986 (inst/sec) elapsed = 0:0:00:44 / Mon Jun 14 15:50:33 2021
GPGPU-Sim PTX: 14900000 instructions simulated : ctaid=(7,0,0) tid=(9,7,0)
GPGPU-Sim PTX: 15000000 instructions simulated : ctaid=(14,6,0) tid=(3,4,0)
GPGPU-Sim PTX: 15100000 instructions simulated : ctaid=(9,1,0) tid=(3,6,0)
GPGPU-Sim PTX: 15200000 instructions simulated : ctaid=(3,2,0) tid=(5,5,0)
GPGPU-Sim uArch: cycles simulated: 43500  inst.: 15233248 (ipc=350.2) sim_rate=338516 (inst/sec) elapsed = 0:0:00:45 / Mon Jun 14 15:50:34 2021
GPGPU-Sim PTX: 15300000 instructions simulated : ctaid=(14,1,0) tid=(5,5,0)
GPGPU-Sim PTX: 15400000 instructions simulated : ctaid=(0,2,0) tid=(7,4,0)
GPGPU-Sim PTX: 15500000 instructions simulated : ctaid=(0,0,0) tid=(7,8,0)
GPGPU-Sim PTX: 15600000 instructions simulated : ctaid=(1,1,0) tid=(3,0,0)
GPGPU-Sim uArch: cycles simulated: 44500  inst.: 15586264 (ipc=350.3) sim_rate=338831 (inst/sec) elapsed = 0:0:00:46 / Mon Jun 14 15:50:35 2021
GPGPU-Sim PTX: 15700000 instructions simulated : ctaid=(6,3,0) tid=(3,6,0)
GPGPU-Sim PTX: 15800000 instructions simulated : ctaid=(4,7,0) tid=(3,4,0)
GPGPU-Sim PTX: 15900000 instructions simulated : ctaid=(0,2,0) tid=(9,7,0)
GPGPU-Sim uArch: cycles simulated: 45500  inst.: 15949040 (ipc=350.5) sim_rate=339341 (inst/sec) elapsed = 0:0:00:47 / Mon Jun 14 15:50:36 2021
GPGPU-Sim PTX: 16000000 instructions simulated : ctaid=(9,5,0) tid=(9,1,0)
GPGPU-Sim PTX: 16100000 instructions simulated : ctaid=(11,5,0) tid=(1,7,0)
GPGPU-Sim PTX: 16200000 instructions simulated : ctaid=(6,0,0) tid=(3,8,0)
GPGPU-Sim PTX: 16300000 instructions simulated : ctaid=(4,4,0) tid=(1,1,0)
GPGPU-Sim uArch: cycles simulated: 46500  inst.: 16323188 (ipc=351.0) sim_rate=340066 (inst/sec) elapsed = 0:0:00:48 / Mon Jun 14 15:50:37 2021
GPGPU-Sim PTX: 16400000 instructions simulated : ctaid=(5,1,0) tid=(3,2,0)
GPGPU-Sim PTX: 16500000 instructions simulated : ctaid=(4,6,0) tid=(9,5,0)
GPGPU-Sim PTX: 16600000 instructions simulated : ctaid=(10,2,0) tid=(5,1,0)
GPGPU-Sim uArch: Shader 16 finished CTA #0 (47265,0), 1 CTAs running
GPGPU-Sim uArch: Shader 29 finished CTA #0 (47414,0), 1 CTAs running
GPGPU-Sim uArch: Shader 5 finished CTA #0 (47423,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #0 (47436,0), 1 CTAs running
GPGPU-Sim uArch: Shader 51 finished CTA #0 (47497,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #0 (47499,0), 1 CTAs running
GPGPU-Sim uArch: cycles simulated: 47500  inst.: 16627484 (ipc=350.1) sim_rate=339336 (inst/sec) elapsed = 0:0:00:49 / Mon Jun 14 15:50:38 2021
GPGPU-Sim uArch: Shader 14 finished CTA #0 (47501,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #0 (47552,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #0 (47559,0), 1 CTAs running
GPGPU-Sim uArch: Shader 14 finished CTA #1 (47584,0), 0 CTAs running
GPGPU-Sim uArch: Shader 14 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 39 finished CTA #0 (47615,0), 1 CTAs running
GPGPU-Sim uArch: Shader 16 finished CTA #1 (47620,0), 0 CTAs running
GPGPU-Sim uArch: Shader 16 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #0 (47632,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #0 (47633,0), 1 CTAs running
GPGPU-Sim uArch: Shader 7 finished CTA #0 (47636,0), 1 CTAs running
GPGPU-Sim uArch: Shader 17 finished CTA #1 (47654,0), 0 CTAs running
GPGPU-Sim uArch: Shader 17 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 10 finished CTA #0 (47661,0), 1 CTAs running
GPGPU-Sim uArch: Shader 34 finished CTA #1 (47663,0), 1 CTAs running
GPGPU-Sim uArch: Shader 53 finished CTA #0 (47671,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #0 (47690,0), 1 CTAs running
GPGPU-Sim uArch: Shader 18 finished CTA #1 (47725,0), 0 CTAs running
GPGPU-Sim uArch: Shader 18 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 27 finished CTA #0 (47732,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #0 (47734,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #0 (47754,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #0 (47758,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #0 (47760,0), 1 CTAs running
GPGPU-Sim uArch: Shader 12 finished CTA #1 (47767,0), 0 CTAs running
GPGPU-Sim uArch: Shader 12 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 5 finished CTA #1 (47768,0), 0 CTAs running
GPGPU-Sim uArch: Shader 5 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 32 finished CTA #0 (47776,0), 1 CTAs running
GPGPU-Sim uArch: Shader 27 finished CTA #1 (47790,0), 0 CTAs running
GPGPU-Sim uArch: Shader 27 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #0 (47794,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #0 (47796,0), 1 CTAs running
GPGPU-Sim uArch: Shader 8 finished CTA #1 (47798,0), 0 CTAs running
GPGPU-Sim uArch: Shader 8 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #0 (47811,0), 1 CTAs running
GPGPU-Sim uArch: Shader 25 finished CTA #0 (47812,0), 1 CTAs running
GPGPU-Sim uArch: Shader 3 finished CTA #1 (47813,0), 0 CTAs running
GPGPU-Sim uArch: Shader 3 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 13 finished CTA #1 (47818,0), 1 CTAs running
GPGPU-Sim uArch: Shader 13 finished CTA #0 (47838,0), 0 CTAs running
GPGPU-Sim uArch: Shader 13 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 44 finished CTA #0 (47844,0), 1 CTAs running
GPGPU-Sim uArch: Shader 32 finished CTA #1 (47848,0), 0 CTAs running
GPGPU-Sim uArch: Shader 32 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 50 finished CTA #1 (47862,0), 1 CTAs running
GPGPU-Sim uArch: Shader 23 finished CTA #1 (47863,0), 0 CTAs running
GPGPU-Sim uArch: Shader 23 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 53 finished CTA #1 (47865,0), 0 CTAs running
GPGPU-Sim uArch: Shader 53 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 7 finished CTA #1 (47867,0), 0 CTAs running
GPGPU-Sim uArch: Shader 7 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #1 (47875,0), 1 CTAs running
GPGPU-Sim uArch: Shader 52 finished CTA #1 (47884,0), 1 CTAs running
GPGPU-Sim uArch: Shader 19 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 19 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 40 finished CTA #1 (47887,0), 0 CTAs running
GPGPU-Sim uArch: Shader 40 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 6 finished CTA #0 (47888,0), 1 CTAs running
GPGPU-Sim uArch: Shader 57 finished CTA #0 (47892,0), 1 CTAs running
GPGPU-Sim uArch: Shader 55 finished CTA #1 (47904,0), 0 CTAs running
GPGPU-Sim uArch: Shader 55 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 52 finished CTA #0 (47906,0), 0 CTAs running
GPGPU-Sim uArch: Shader 52 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 20 finished CTA #0 (47909,0), 0 CTAs running
GPGPU-Sim uArch: Shader 20 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 0 finished CTA #0 (47914,0), 1 CTAs running
GPGPU-Sim uArch: Shader 10 finished CTA #1 (47915,0), 0 CTAs running
GPGPU-Sim uArch: Shader 10 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 29 finished CTA #1 (47919,0), 0 CTAs running
GPGPU-Sim uArch: Shader 29 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 11 finished CTA #0 (47938,0), 1 CTAs running
GPGPU-Sim uArch: Shader 45 finished CTA #1 (47963,0), 0 CTAs running
GPGPU-Sim uArch: Shader 45 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #1 (47964,0), 1 CTAs running
GPGPU-Sim uArch: Shader 11 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 11 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 57 finished CTA #1 (47965,0), 0 CTAs running
GPGPU-Sim uArch: Shader 57 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #0 (47967,0), 1 CTAs running
GPGPU-Sim uArch: Shader 43 finished CTA #0 (47975,0), 1 CTAs running
GPGPU-Sim uArch: Shader 47 finished CTA #0 (47978,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #0 (47985,0), 1 CTAs running
GPGPU-Sim uArch: Shader 36 finished CTA #0 (47987,0), 1 CTAs running
GPGPU-Sim uArch: Shader 4 finished CTA #1 (47988,0), 0 CTAs running
GPGPU-Sim uArch: Shader 4 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 9 finished CTA #1 (47991,0), 0 CTAs running
GPGPU-Sim uArch: Shader 9 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #0 (47996,0), 1 CTAs running
GPGPU-Sim uArch: Shader 21 finished CTA #1 (48000,0), 0 CTAs running
GPGPU-Sim uArch: Shader 21 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 51 finished CTA #1 (48001,0), 0 CTAs running
GPGPU-Sim uArch: Shader 51 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 15 finished CTA #0 (48008,0), 0 CTAs running
GPGPU-Sim uArch: Shader 15 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #0 (48010,0), 1 CTAs running
GPGPU-Sim uArch: Shader 30 finished CTA #0 (48018,0), 1 CTAs running
GPGPU-Sim uArch: Shader 0 finished CTA #1 (48019,0), 0 CTAs running
GPGPU-Sim uArch: Shader 0 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 54 finished CTA #0 (48026,0), 1 CTAs running
GPGPU-Sim uArch: Shader 6 finished CTA #1 (48028,0), 0 CTAs running
GPGPU-Sim uArch: Shader 6 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #0 (48031,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #0 (48042,0), 1 CTAs running
GPGPU-Sim uArch: Shader 54 finished CTA #1 (48056,0), 0 CTAs running
GPGPU-Sim uArch: Shader 54 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 24 finished CTA #1 (48059,0), 0 CTAs running
GPGPU-Sim uArch: Shader 24 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #1 (48061,0), 1 CTAs running
GPGPU-Sim uArch: Shader 48 finished CTA #1 (48063,0), 0 CTAs running
GPGPU-Sim uArch: Shader 48 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 30 finished CTA #1 (48070,0), 0 CTAs running
GPGPU-Sim uArch: Shader 30 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 25 finished CTA #1 (48079,0), 0 CTAs running
GPGPU-Sim uArch: Shader 25 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #1 (48081,0), 1 CTAs running
GPGPU-Sim uArch: Shader 59 finished CTA #1 (48083,0), 0 CTAs running
GPGPU-Sim uArch: Shader 59 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #0 (48084,0), 1 CTAs running
GPGPU-Sim uArch: Shader 49 finished CTA #1 (48086,0), 1 CTAs running
GPGPU-Sim uArch: Shader 39 finished CTA #1 (48096,0), 0 CTAs running
GPGPU-Sim uArch: Shader 39 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 2 finished CTA #0 (48098,0), 0 CTAs running
GPGPU-Sim uArch: Shader 2 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 49 finished CTA #0 (48102,0), 0 CTAs running
GPGPU-Sim uArch: Shader 49 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 28 finished CTA #0 (48104,0), 1 CTAs running
GPGPU-Sim uArch: Shader 28 finished CTA #1 (48107,0), 0 CTAs running
GPGPU-Sim uArch: Shader 28 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #0 (48111,0), 1 CTAs running
GPGPU-Sim uArch: Shader 50 finished CTA #0 (48111,0), 0 CTAs running
GPGPU-Sim uArch: Shader 50 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 22 finished CTA #1 (48114,0), 0 CTAs running
GPGPU-Sim uArch: Shader 22 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 58 finished CTA #1 (48132,0), 0 CTAs running
GPGPU-Sim uArch: Shader 58 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 47 finished CTA #1 (48143,0), 0 CTAs running
GPGPU-Sim uArch: Shader 47 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #0 (48153,0), 1 CTAs running
GPGPU-Sim uArch: Shader 35 finished CTA #1 (48153,0), 0 CTAs running
GPGPU-Sim uArch: Shader 35 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 36 finished CTA #1 (48154,0), 0 CTAs running
GPGPU-Sim uArch: Shader 36 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 31 finished CTA #1 (48160,0), 0 CTAs running
GPGPU-Sim uArch: Shader 31 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 26 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 26 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 33 finished CTA #1 (48163,0), 0 CTAs running
GPGPU-Sim uArch: Shader 33 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 38 finished CTA #0 (48165,0), 0 CTAs running
GPGPU-Sim uArch: Shader 38 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 34 finished CTA #0 (48177,0), 0 CTAs running
GPGPU-Sim uArch: Shader 34 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 56 finished CTA #1 (48188,0), 0 CTAs running
GPGPU-Sim uArch: Shader 56 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #1 (48189,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #0 (48192,0), 1 CTAs running
GPGPU-Sim uArch: Shader 37 finished CTA #1 (48202,0), 1 CTAs running
GPGPU-Sim uArch: Shader 1 finished CTA #1 (48212,0), 0 CTAs running
GPGPU-Sim uArch: Shader 1 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 41 finished CTA #0 (48216,0), 0 CTAs running
GPGPU-Sim uArch: Shader 41 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 37 finished CTA #0 (48218,0), 0 CTAs running
GPGPU-Sim uArch: Shader 37 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 46 finished CTA #1 (48221,0), 0 CTAs running
GPGPU-Sim uArch: Shader 46 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #0 (48226,0), 1 CTAs running
GPGPU-Sim uArch: Shader 44 finished CTA #1 (48233,0), 0 CTAs running
GPGPU-Sim uArch: Shader 44 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 42 finished CTA #1 (48248,0), 0 CTAs running
GPGPU-Sim uArch: Shader 42 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: Shader 43 finished CTA #1 (48281,0), 0 CTAs running
GPGPU-Sim uArch: Shader 43 empty (release kernel 1 '_Z14matrix_mul_gpuPiS_S_i').
GPGPU-Sim uArch: GPU detected kernel '_Z14matrix_mul_gpuPiS_S_i' finished on shader 43.
kernel_name = _Z14matrix_mul_gpuPiS_S_i 
kernel_launch_uid = 1 
gpu_sim_cycle = 48282
gpu_sim_insn = 16632000
gpu_ipc =     344.4762
gpu_tot_sim_cycle = 48282
gpu_tot_sim_insn = 16632000
gpu_tot_ipc =     344.4762
gpu_tot_issued_cta = 0
gpu_stall_dramfull = 42547
gpu_stall_icnt2sh    = 68778
gpu_total_sim_rate=339428

========= Core cache stats =========
L1I_cache:
	L1I_total_cache_accesses = 371520
	L1I_total_cache_misses = 1920
	L1I_total_cache_miss_rate = 0.0052
	L1I_total_cache_pending_hits = 0
	L1I_total_cache_reservation_fails = 0
L1D_cache:
	L1D_cache_core[0]: Access = 21513, Miss = 1284, Miss_rate = 0.060, Pending_hits = 5147, Reservation_fails = 2614
	L1D_cache_core[1]: Access = 21489, Miss = 1278, Miss_rate = 0.059, Pending_hits = 5140, Reservation_fails = 1796
	L1D_cache_core[2]: Access = 21492, Miss = 1281, Miss_rate = 0.060, Pending_hits = 5151, Reservation_fails = 2084
	L1D_cache_core[3]: Access = 21457, Miss = 1268, Miss_rate = 0.059, Pending_hits = 5125, Reservation_fails = 1525
	L1D_cache_core[4]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5144, Reservation_fails = 928
	L1D_cache_core[5]: Access = 21481, Miss = 1272, Miss_rate = 0.059, Pending_hits = 5142, Reservation_fails = 2045
	L1D_cache_core[6]: Access = 21484, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5143, Reservation_fails = 2898
	L1D_cache_core[7]: Access = 21505, Miss = 1277, Miss_rate = 0.059, Pending_hits = 5158, Reservation_fails = 3324
	L1D_cache_core[8]: Access = 21508, Miss = 1279, Miss_rate = 0.059, Pending_hits = 5149, Reservation_fails = 2750
	L1D_cache_core[9]: Access = 21505, Miss = 1287, Miss_rate = 0.060, Pending_hits = 5157, Reservation_fails = 3313
	L1D_cache_core[10]: Access = 21508, Miss = 1293, Miss_rate = 0.060, Pending_hits = 5164, Reservation_fails = 3276
	L1D_cache_core[11]: Access = 21505, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5148, Reservation_fails = 2931
	L1D_cache_core[12]: Access = 21508, Miss = 1292, Miss_rate = 0.060, Pending_hits = 5150, Reservation_fails = 2987
	L1D_cache_core[13]: Access = 21513, Miss = 1289, Miss_rate = 0.060, Pending_hits = 5145, Reservation_fails = 2983
	L1D_cache_core[14]: Access = 21516, Miss = 1285, Miss_rate = 0.060, Pending_hits = 5136, Reservation_fails = 3154
	L1D_total_cache_accesses = 322468
	L1D_total_cache_misses = 19228
	L1D_total_cache_miss_rate = 0.0596
	L1D_total_cache_pending_hits = 77199
	L1D_total_cache_reservation_fails = 38608
	L1D_cache_data_port_util = 0.078
	L1D_cache_fill_port_util = 0.006
L1C_cache:
	L1C_total_cache_accesses = 1920
	L1C_total_cache_misses = 480
	L1C_total_cache_miss_rate = 0.2500
	L1C_total_cache_pending_hits = 0
	L1C_total_cache_reservation_fails = 0
L1T_cache:
	L1T_total_cache_accesses = 0
	L1T_total_cache_misses = 0
	L1T_total_cache_pending_hits = 0
	L1T_total_cache_reservation_fails = 0

Total_core_cache_stats:
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 225461
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 77199
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 17972
	Total_core_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 13408
	Total_core_cache_stats_breakdown[CONST_ACC_R][HIT] = 1440
	Total_core_cache_stats_breakdown[CONST_ACC_R][MISS] = 480
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 580
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 1256
	Total_core_cache_stats_breakdown[GLOBAL_ACC_W][RESERVATION_FAIL] = 25200
	Total_core_cache_stats_breakdown[INST_ACC_R][HIT] = 369600
	Total_core_cache_stats_breakdown[INST_ACC_R][MISS] = 1920
Shader 0 warp_id issue ditsribution:
warp_id:
0, 1, 2, 3, 4, 5, 6, 7, 
distro:
1388, 1388, 1388, 1388, 1388, 1388, 1388, 1388, 
gpgpu_n_tot_thrd_icount = 21319680
gpgpu_n_tot_w_icount = 666240
gpgpu_n_stall_shd_mem = 216596
gpgpu_n_mem_read_local = 0
gpgpu_n_mem_write_local = 0
gpgpu_n_mem_read_global = 17972
gpgpu_n_mem_write_global = 1836
gpgpu_n_mem_texture = 0
gpgpu_n_mem_const = 60
gpgpu_n_load_insn  = 3600000
gpgpu_n_store_insn = 12000
gpgpu_n_shmem_insn = 0
gpgpu_n_tex_insn = 0
gpgpu_n_const_mem_insn = 0
gpgpu_n_param_mem_insn = 48000
gpgpu_n_shmem_bkconflict = 0
gpgpu_n_cache_bkconflict = 0
gpgpu_n_intrawarp_mshr_merge = 0
gpgpu_n_cmem_portconflict = 0
gpgpu_stall_shd_mem[c_mem][bk_conf] = 0
gpgpu_stall_shd_mem[c_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[c_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[c_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[t_mem][mshr_rc] = 0
gpgpu_stall_shd_mem[t_mem][icnt_rc] = 0
gpgpu_stall_shd_mem[t_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[s_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][bk_conf] = 0
gpgpu_stall_shd_mem[gl_mem][coal_stall] = 216596
gpgpu_stall_shd_mem[gl_mem][data_port_stall] = 0
gpgpu_stall_shd_mem[g_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[g_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[g_mem_st][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_ld][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpgpu_stall_shd_mem[l_mem_st][mshr_rc] = 0
gpgpu_stall_shd_mem[l_mem_st][icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_icnt_rc] = 0
gpgpu_stall_shd_mem[l_mem_ld][wb_rsrv_fail] = 0
gpu_reg_bank_conflict_stalls = 0
Warp Occupancy Distribution:
Stall:169500	W0_Idle:293067	W0_Scoreboard:4645929	W1:0	W2:0	W3:0	W4:166560	W5:0	W6:0	W7:0	W8:0	W9:0	W10:0	W11:0	W12:0	W13:0	W14:0	W15:0	W16:0	W17:0	W18:0	W19:0	W20:0	W21:0	W22:0	W23:0	W24:0	W25:0	W26:0	W27:0	W28:0	W29:0	W30:0	W31:0	W32:499680
traffic_breakdown_coretomem[CONST_ACC_R] = 480 {8:60,}
traffic_breakdown_coretomem[GLOBAL_ACC_R] = 143776 {8:17972,}
traffic_breakdown_coretomem[GLOBAL_ACC_W] = 117600 {40:1008,72:552,136:276,}
traffic_breakdown_coretomem[INST_ACC_R] = 1920 {8:240,}
traffic_breakdown_memtocore[CONST_ACC_R] = 4320 {72:60,}
traffic_breakdown_memtocore[GLOBAL_ACC_R] = 2444192 {136:17972,}
traffic_breakdown_memtocore[GLOBAL_ACC_W] = 14688 {8:1836,}
traffic_breakdown_memtocore[INST_ACC_R] = 32640 {136:240,}
maxmrqlatency = 12 
maxdqlatency = 0 
maxmflatency = 1356 
averagemflatency = 264 
max_icnt2mem_latency = 1205 
max_icnt2sh_latency = 48281 
mrq_lat_table:1080 	32 	4 	10 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
dq_lat_table:0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_table:0 	0 	0 	0 	0 	0 	0 	10720 	8178 	899 	71 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2mem_lat_table:0 	0 	0 	15440 	432 	764 	1403 	1061 	742 	253 	13 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
icnt2sh_lat_table:0 	0 	0 	3446 	13655 	892 	39 	0 	0 	0 	0 	0 	0 	0 	0 	1836 	0 	0 	0 	0 	0 	0 	0 	0 	
mf_lat_pw_table:0 	0 	0 	0 	0 	0 	0 	87 	6 	2 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	
maximum concurrent accesses to same row:
dram[0]:         1         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
maximum service time to same row:
dram[0]:       502         0         0         0      1650      1707      1653      1501     12085     12831     36822     37578         0         0         0         0 
dram[1]:      1485         0         0         0      1763      1494      1765      1521     12204     13041     36903     37765         0         0         0         0 
dram[2]:         0         0         0         0      1503      1510      1526      1516     12325     13141     37102     37849         0         0         0         0 
dram[3]:         0         0         0         0      1499      1588      1525      1678     12515     13263     37144     38024         0         0         0         0 
dram[4]:         0         0         0         0      1616      1493      1484      1515     12585     13452     37341     38075         0         0         0         0 
dram[5]:         0         0         0         0      1497      1606      1519      1688     12779     13531     37456     38221         0         0         0         0 
average row accesses per activate:
dram[0]:  1.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 21.000000 19.000000      -nan      -nan      -nan      -nan 
dram[1]:  2.000000      -nan      -nan      -nan 10.000000 10.000000 32.000000 32.000000 32.000000 32.000000 19.000000 16.000000      -nan      -nan      -nan      -nan 
dram[2]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 18.000000      -nan      -nan      -nan      -nan 
dram[3]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 19.000000 17.000000      -nan      -nan      -nan      -nan 
dram[4]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 18.000000 21.000000      -nan      -nan      -nan      -nan 
dram[5]:      -nan      -nan      -nan      -nan 10.000000 12.000000 32.000000 32.000000 32.000000 32.000000 22.000000 16.000000      -nan      -nan      -nan      -nan 
average row locality = 1126/52 = 21.653847
number of total memory accesses made:
dram[0]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
total accesses: 0
min_bank_accesses = 0!
min_chip_accesses = 0!
number of total read accesses:
dram[0]:         3         0         0         0        10        10        32        32        32        32        16        16         0         0         0         0 
dram[1]:         2         0         0         0        10        10        32        32        32        32        16        15         0         0         0         0 
dram[2]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[3]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[4]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
dram[5]:         0         0         0         0        10        12        32        32        32        32        16        14         0         0         0         0 
total reads: 1084
min_bank_accesses = 0!
chip skew: 183/180 = 1.02
number of total write accesses:
dram[0]:         0         0         0         0         0         0         0         0         0         0         5         3         0         0         0         0 
dram[1]:         0         0         0         0         0         0         0         0         0         0         3         1         0         0         0         0 
dram[2]:         0         0         0         0         0         0         0         0         0         0         3         4         0         0         0         0 
dram[3]:         0         0         0         0         0         0         0         0         0         0         3         3         0         0         0         0 
dram[4]:         0         0         0         0         0         0         0         0         0         0         2         7         0         0         0         0 
dram[5]:         0         0         0         0         0         0         0         0         0         0         6         2         0         0         0         0 
total reads: 42
min_bank_accesses = 0!
chip skew: 9/4 = 2.25
average mf latency per bank:
dram[0]:       8505    none      none      none        6219      4879      4705      5583      4863      4800      4214      4411    none      none      none      none  
dram[1]:          0    none      none      none        4030      5132      5447      4339      4812      4857      4258      5293    none      none      none      none  
dram[2]:     none      none      none      none        4973      3789      4482      4576      4719      4883      4095      3891    none      none      none      none  
dram[3]:     none      none      none      none        3574      4782      4611      4852      4883      5030      4219      3979    none      none      none      none  
dram[4]:     none      none      none      none        6249      4014      5355      4358      4534      4853      4952      3543    none      none      none      none  
dram[5]:     none      none      none      none        4108      5204      4843      5156      4601      4623      3826      4504    none      none      none      none  
maximum mf latency per bank:
dram[0]:        486         0         0         0      1110      1069      1107      1325       592       781       876       721         0         0         0         0
dram[1]:          0         0         0         0      1097       809      1356       876       801       646       697       706         0         0         0         0
dram[2]:          0         0         0         0       882       679       907      1071       716       771       873       759         0         0         0         0
dram[3]:          0         0         0         0       377       983      1018      1068       742       781       690       817         0         0         0         0
dram[4]:          0         0         0         0      1081       527      1180       723       799       631       718       700         0         0         0         0
dram[5]:          0         0         0         0       543      1153      1006      1168       728       775       873       742         0         0         0         0

Number of Memory Banks Accessed per Memory Operation per Warp (from 0):
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
Average # of Memory Banks Accessed per Memory Operation per Warp=-nan

position of mrq chosen
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	

average position of mrq chosen = -nan
Memory Partition 0: 
Cache L2_bank_000:
MSHR contents

Cache L2_bank_001:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[0]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63344 n_act=11 n_pre=2 n_req=191 n_rd=366 n_write=8 bw_util=0.01174
n_activity=2903 dram_eff=0.2577
bk0: 6a 63659i bk1: 0a 63730i bk2: 0a 63734i bk3: 0a 63734i bk4: 20a 63671i bk5: 20a 63680i bk6: 64a 63581i bk7: 64a 63588i bk8: 64a 63586i bk9: 64a 63580i bk10: 32a 63625i bk11: 32a 63618i bk12: 0a 63727i bk13: 0a 63727i bk14: 0a 63730i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000784548
Memory Partition 1: 
Cache L2_bank_002:
MSHR contents

Cache L2_bank_003:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[1]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=9 n_pre=0 n_req=185 n_rd=362 n_write=5 bw_util=0.01152
n_activity=2703 dram_eff=0.2716
bk0: 4a 63711i bk1: 0a 63732i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 20a 63676i bk6: 64a 63580i bk7: 64a 63580i bk8: 64a 63587i bk9: 64a 63581i bk10: 32a 63624i bk11: 30a 63641i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63732i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000266746
Memory Partition 2: 
Cache L2_bank_004:
MSHR contents

Cache L2_bank_005:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[2]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=187 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2765 dram_eff=0.2662
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63679i bk5: 24a 63671i bk6: 64a 63581i bk7: 64a 63590i bk8: 64a 63588i bk9: 64a 63587i bk10: 32a 63623i bk11: 28a 63638i bk12: 0a 63728i bk13: 0a 63730i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000235364
Memory Partition 3: 
Cache L2_bank_006:
MSHR contents

Cache L2_bank_007:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[3]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63357 n_act=8 n_pre=0 n_req=186 n_rd=360 n_write=6 bw_util=0.01149
n_activity=2830 dram_eff=0.2587
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63732i bk4: 20a 63680i bk5: 24a 63670i bk6: 64a 63580i bk7: 64a 63572i bk8: 64a 63586i bk9: 64a 63589i bk10: 32a 63621i bk11: 28a 63638i bk12: 0a 63729i bk13: 0a 63729i bk14: 0a 63731i bk15: 0a 63731i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=2 avg=0.000580565
Memory Partition 4: 
Cache L2_bank_008:
MSHR contents

Cache L2_bank_009:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[4]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63353 n_act=8 n_pre=0 n_req=189 n_rd=360 n_write=10 bw_util=0.01161
n_activity=2771 dram_eff=0.2671
bk0: 0a 63731i bk1: 0a 63733i bk2: 0a 63733i bk3: 0a 63733i bk4: 20a 63679i bk5: 24a 63670i bk6: 64a 63587i bk7: 64a 63587i bk8: 64a 63588i bk9: 64a 63581i bk10: 32a 63633i bk11: 28a 63627i bk12: 0a 63728i bk13: 0a 63729i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.000313819
Memory Partition 5: 
Cache L2_bank_010:
MSHR contents

Cache L2_bank_011:
MSHR contents

In Dram Latency Queue (total = 0): 
DRAM[5]: 16 bks, busW=4 BL=8 CL=12, tRRD=2 tCCD=6, tRCD=12 tRAS=28 tRP=12 tRC=40
n_cmd=63731 n_nop=63355 n_act=8 n_pre=0 n_req=188 n_rd=360 n_write=8 bw_util=0.01155
n_activity=2789 dram_eff=0.2639
bk0: 0a 63731i bk1: 0a 63732i bk2: 0a 63732i bk3: 0a 63733i bk4: 20a 63678i bk5: 24a 63668i bk6: 64a 63584i bk7: 64a 63582i bk8: 64a 63589i bk9: 64a 63588i bk10: 32a 63598i bk11: 28a 63623i bk12: 0a 63729i bk13: 0a 63730i bk14: 0a 63730i bk15: 0a 63730i 
dram_util_bins: 0 0 0 0 0 0 0 0 0 0
dram_eff_bins: 0 0 0 0 0 0 0 0 0 0
mrqq: max=1 avg=0.00105129

========= L2 cache stats =========
L2_cache_bank[0]: Access = 1826, Miss = 93, Miss_rate = 0.051, Pending_hits = 248, Reservation_fails = 4441
L2_cache_bank[1]: Access = 1684, Miss = 90, Miss_rate = 0.053, Pending_hits = 231, Reservation_fails = 3387
L2_cache_bank[2]: Access = 1777, Miss = 92, Miss_rate = 0.052, Pending_hits = 239, Reservation_fails = 3478
L2_cache_bank[3]: Access = 1652, Miss = 89, Miss_rate = 0.054, Pending_hits = 227, Reservation_fails = 3558
L2_cache_bank[4]: Access = 1642, Miss = 90, Miss_rate = 0.055, Pending_hits = 236, Reservation_fails = 3430
L2_cache_bank[5]: Access = 1661, Miss = 90, Miss_rate = 0.054, Pending_hits = 232, Reservation_fails = 3472
L2_cache_bank[6]: Access = 1637, Miss = 90, Miss_rate = 0.055, Pending_hits = 237, Reservation_fails = 3884
L2_cache_bank[7]: Access = 1639, Miss = 90, Miss_rate = 0.055, Pending_hits = 250, Reservation_fails = 4069
L2_cache_bank[8]: Access = 1656, Miss = 90, Miss_rate = 0.054, Pending_hits = 250, Reservation_fails = 3783
L2_cache_bank[9]: Access = 1643, Miss = 90, Miss_rate = 0.055, Pending_hits = 241, Reservation_fails = 3742
L2_cache_bank[10]: Access = 1641, Miss = 90, Miss_rate = 0.055, Pending_hits = 239, Reservation_fails = 3801
L2_cache_bank[11]: Access = 1650, Miss = 90, Miss_rate = 0.055, Pending_hits = 243, Reservation_fails = 3849
L2_total_cache_accesses = 20108
L2_total_cache_misses = 1084
L2_total_cache_miss_rate = 0.0539
L2_total_cache_pending_hits = 2873
L2_total_cache_reservation_fails = 44894
L2_total_cache_breakdown:
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT] = 14077
	L2_cache_stats_breakdown[GLOBAL_ACC_R][HIT_RESERVED] = 2839
	L2_cache_stats_breakdown[GLOBAL_ACC_R][MISS] = 1056
	L2_cache_stats_breakdown[GLOBAL_ACC_R][RESERVATION_FAIL] = 44379
	L2_cache_stats_breakdown[CONST_ACC_R][HIT] = 56
	L2_cache_stats_breakdown[CONST_ACC_R][HIT_RESERVED] = 3
	L2_cache_stats_breakdown[CONST_ACC_R][MISS] = 1
	L2_cache_stats_breakdown[CONST_ACC_R][RESERVATION_FAIL] = 129
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT] = 1794
	L2_cache_stats_breakdown[GLOBAL_ACC_W][HIT_RESERVED] = 19
	L2_cache_stats_breakdown[GLOBAL_ACC_W][MISS] = 23
	L2_cache_stats_breakdown[INST_ACC_R][HIT] = 224
	L2_cache_stats_breakdown[INST_ACC_R][HIT_RESERVED] = 12
	L2_cache_stats_breakdown[INST_ACC_R][MISS] = 4
	L2_cache_stats_breakdown[INST_ACC_R][RESERVATION_FAIL] = 386
L2_cache_data_port_util = 0.104
L2_cache_fill_port_util = 0.007

icnt_total_pkts_mem_to_simt=93076
icnt_total_pkts_simt_to_mem=23324
LD_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ST_mem_lat_dist  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
----------------------------Interconnect-DETAILS--------------------------------
Class 0:
Packet latency average = 18.5053
	minimum = 6
	maximum = 729
Network latency average = 13.7655
	minimum = 6
	maximum = 426
Slowest packet = 623
Flit latency average = 11.0758
	minimum = 6
	maximum = 426
Slowest flit = 1683
Fragmentation average = 0
	minimum = 0
	maximum = 0
Injected packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Accepted packet rate average = 0.0308496
	minimum = 0.0275465 (at node 3)
	maximum = 0.0378195 (at node 15)
Injected flit rate average = 0.0892902
	minimum = 0.0318131 (at node 3)
	maximum = 0.173936 (at node 15)
Accepted flit rate average= 0.0892902
	minimum = 0.0392072 (at node 24)
	maximum = 0.129406 (at node 9)
Injected packet length average = 2.89437
Accepted packet length average = 2.89437
Total in-flight flits = 0 (0 measured)
====== Overall Traffic Statistics ======
====== Traffic class 0 ======
Packet latency average = 18.5053 (1 samples)
	minimum = 6 (1 samples)
	maximum = 729 (1 samples)
Network latency average = 13.7655 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Flit latency average = 11.0758 (1 samples)
	minimum = 6 (1 samples)
	maximum = 426 (1 samples)
Fragmentation average = 0 (1 samples)
	minimum = 0 (1 samples)
	maximum = 0 (1 samples)
Injected packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Accepted packet rate average = 0.0308496 (1 samples)
	minimum = 0.0275465 (1 samples)
	maximum = 0.0378195 (1 samples)
Injected flit rate average = 0.0892902 (1 samples)
	minimum = 0.0318131 (1 samples)
	maximum = 0.173936 (1 samples)
Accepted flit rate average = 0.0892902 (1 samples)
	minimum = 0.0392072 (1 samples)
	maximum = 0.129406 (1 samples)
Injected packet size average = 2.89437 (1 samples)
Accepted packet size average = 2.89437 (1 samples)
Hops average = 1 (1 samples)
----------------------------END-of-Interconnect-DETAILS-------------------------


gpgpu_simulation_time = 0 days, 0 hrs, 0 min, 49 sec (49 sec)
gpgpu_simulation_rate = 339428 (inst/sec)
gpgpu_simulation_rate = 985 (cycle/sec)
total time is 49381 ms
2021-06-14T15:50:38+08:00
